#### Copyright (C) 2022 Sobhan Moradian Daghigh
#### Date: 2/2/2022

### Import Libraries

In [48]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import display, HTML
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Loading Datasets

In [13]:
def load_datasets(path):
    datasets = []
    print('Loading the datasets..\n')
    for root, dirs, files in os.walk(path):
        for i, file in enumerate(files):
            print(' |_  {}. {} loaded'.format(i + 1, file.split('.')[0]))
            datasets.append(pd.read_csv(os.path.join(root, file), names=['sentence', 'label'], sep='\t'))
    return datasets

In [80]:
datasets_name = ['Amazon', 'IMDB', 'Yelp']
datasets = load_datasets('./Datasets/')

Loading the datasets..

 |_  1. amazon loaded
 |_  2. imdb loaded
 |_  3. yelp loaded


In [4]:
# Amazon
datasets[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1000 non-null   object
 1   label     1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# IMDB
datasets[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  748 non-null    object
 1   label     748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [6]:
# Yelp
datasets[2].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1000 non-null   object
 1   label     1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


### Train - Test - Split

In [46]:
def spliter(datasets):
    x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets = [], [], [], []
    for dataset in datasets:
        x_train, x_test, y_train, y_test = train_test_split(dataset.sentence, dataset.label, test_size=0.3, random_state=0)
        x_train_datasets.append(x_train)
        x_test_datasets.append(x_test)
        y_train_datasets.append(y_train)
        y_test_datasets.append(y_test)
    return x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets

In [49]:
x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets = spliter(datasets)

### Feature extraction using TF-IDF

In [57]:
def feature_extractor(dataset):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(dataset)
    return vectorizer

In [81]:
def get_top_frequent_features(vectorizer):
    df = pd.DataFrame({'words': vectorizer.vocabulary_.keys(), 'counts': vectorizer.vocabulary_.values()})
    print('{}:'.format(datasets_name[i]))
    display(df.sort_values(by=['counts'], ascending=False).head())

In [82]:
# Render DataFrames side by side by CSS override
css = """.output {flex-direction: row;}"""
HTML('<style>{}</style>'.format(css))

### Top 5 frequent words of each dataset

In [83]:
# Display
vectorizers = []
for i, dataset in enumerate(x_train_datasets):
    vectorizer = feature_extractor(dataset)
    get_top_frequent_features(vectorizer)
    vectorizers.append(vectorizer)

Amazon:


Unnamed: 0,words,counts
665,zero,1494
121,your,1493
117,you,1492
703,yet,1491
1475,yes,1490


IMDB:


Unnamed: 0,words,counts
1931,zombiez,2439
1107,zombie,2438
1447,zillion,2437
152,youtube,2436
2099,youthful,2435


Yelp:


Unnamed: 0,words,counts
264,zero,1648
629,yummy,1647
1429,yukon,1646
989,yucky,1645
366,yourself,1644


### Logistic Regression

In [84]:
print("Accuracy:")
for i, (vec, x_tr, x_ts, y_tr, y_ts) in enumerate(zip(vectorizers, x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets)):
    X_train = vectorizer.transform(x_tr)
    X_test = vectorizer.transform(x_ts)

    clf = LogisticRegression()
    clf.fit(X_train, y_tr)
    score = clf.score(X_test, y_ts)
    
    print(' |_  {}: {:.3f}'.format(datasets_name[i], score))

Accuracy:
 |_  Amazon: 0.753
 |_  IMDB: 0.684
 |_  Yelp: 0.793
