In [1]:
from glob import glob
from functions import create_sentiment_dataset, build_preprocess_pipeline
from tqdm import tqdm

from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression


In [2]:
files = glob('data/Multi Domain Sentiment/processed_acl/*/*')

In [3]:
df = create_sentiment_dataset(files)
df

Unnamed: 0,raw_text,label,text,folder,file
0,gaps:1 well:1 it_together:1 a_stack:1 the_cd:1...,negative,gaps well it together a stack the cd bott...,electronics,negative.review
1,save_your:2 steady_on:1 save:2 picture:1 your_...,negative,save your steady on save picture your mone...,electronics,negative.review
2,i:2 slightest_smudge:1 nice_for:1 errors:1 pla...,negative,i slightest smudge nice for errors player ...,electronics,negative.review
3,but_i:1 two:1 i:2 even:1 without:1 one:1 inexp...,negative,but i two i even without one inexpensive...,electronics,negative.review
4,failure:1 people_should:1 my_software:1 and_sa...,negative,failure people should my software and save ...,electronics,negative.review
...,...,...,...,...,...
27672,mass:1 specifically_references:1 science:1 rel...,negative,mass specifically references science releva...,books,unlabeled.review
27673,reviewer's_comments:1 to_pick:1 comments:1 and...,negative,reviewer's comments to pick comments and ul...,books,unlabeled.review
27674,x-ers:1 entry-level:1 can_dip:1 from_beginning...,positive,x-ers entry-level can dip from beginning v...,books,unlabeled.review
27675,your:1 well:1 around_for:1 you're_coming:1 for...,positive,your well around for you're coming for mor...,books,unlabeled.review


In [4]:
df.groupby(['folder','file']).size().unstack().fillna(0)

file,negative.review,positive.review,unlabeled.review
folder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
books,1000,1000,4465
dvd,1000,1000,3586
electronics,1000,1000,5681
kitchen,1000,1000,5945


## Train - Test splitting

In [5]:
train_data = df[df.file!='unlabeled.review'].reset_index(drop=True)
test_data = df[df.file=='unlabeled.review'].reset_index(drop=True)

# Classifier per category
## TF - IDF
### Logistic Regression

In [6]:
for cate in tqdm(train_data['folder'].unique()):
    
    cate_train_data = train_data[train_data['folder']==cate]
    cate_test_data = test_data[test_data['folder']==cate]
    
    # Preprocess the data
    tfidf_pipeline = build_preprocess_pipeline('tfidf').fit(cate_train_data['text'])
    X_train_tfidf_transformed = tfidf_pipeline.transform(cate_train_data['text'])
    
    logistic_estimator = LogisticRegression(n_jobs=-1, random_state=42, 
                                            class_weight=None, solver='saga',
                                            max_iter=1000, penalty='l2',
                                            tol=1e-2, C=1
                                            )

    cate_lr = logistic_estimator.fit(X_train_tfidf_transformed, cate_train_data['label'])
    
    ## Test the model
    
    X_test_transformed_tfidf = tfidf_pipeline.transform(cate_test_data['text'])
    y_pred = cate_lr.predict(X_test_transformed_tfidf)
    
    print(f'************* {cate} *************')
    print(classification_report(cate_test_data['label'], y_pred))

 25%|██▌       | 1/4 [00:02<00:08,  2.76s/it]

************* electronics *************
              precision    recall  f1-score   support

    negative       0.85      0.84      0.84      2824
    positive       0.84      0.85      0.85      2857

    accuracy                           0.85      5681
   macro avg       0.85      0.85      0.85      5681
weighted avg       0.85      0.85      0.85      5681



 50%|█████     | 2/4 [00:05<00:05,  2.54s/it]

************* kitchen *************
              precision    recall  f1-score   support

    negative       0.86      0.86      0.86      2991
    positive       0.86      0.85      0.86      2954

    accuracy                           0.86      5945
   macro avg       0.86      0.86      0.86      5945
weighted avg       0.86      0.86      0.86      5945



 75%|███████▌  | 3/4 [00:08<00:03,  3.01s/it]

************* dvd *************
              precision    recall  f1-score   support

    negative       0.85      0.80      0.82      1779
    positive       0.81      0.86      0.84      1807

    accuracy                           0.83      3586
   macro avg       0.83      0.83      0.83      3586
weighted avg       0.83      0.83      0.83      3586



100%|██████████| 4/4 [00:12<00:00,  3.16s/it]

************* books *************
              precision    recall  f1-score   support

    negative       0.82      0.82      0.82      2201
    positive       0.82      0.83      0.83      2264

    accuracy                           0.82      4465
   macro avg       0.82      0.82      0.82      4465
weighted avg       0.82      0.82      0.82      4465






### Naive Bayes

In [7]:
for cate in tqdm(train_data['folder'].unique()):
    
    cate_train_data = train_data[train_data['folder']==cate]
    cate_test_data = test_data[test_data['folder']==cate]
    
    tfidf_pipeline = build_preprocess_pipeline('tfidf').fit(cate_train_data['text'])
    X_train_tfidf_transformed = tfidf_pipeline.transform(cate_train_data['text'])
    
    nb_estimator = MultinomialNB(alpha=1.0)

    cate_nb = nb_estimator.fit(X_train_tfidf_transformed, cate_train_data['label'])
    
    ## Test the model
    
    X_test_transformed_tfidf = tfidf_pipeline.transform(cate_test_data['text'])
    y_pred = cate_nb.predict(X_test_transformed_tfidf)
    
    print(f'************* {cate} *************')
    print(classification_report(cate_test_data['label'], y_pred))

 25%|██▌       | 1/4 [00:02<00:08,  2.68s/it]

************* electronics *************
              precision    recall  f1-score   support

    negative       0.84      0.84      0.84      2824
    positive       0.84      0.84      0.84      2857

    accuracy                           0.84      5681
   macro avg       0.84      0.84      0.84      5681
weighted avg       0.84      0.84      0.84      5681



 50%|█████     | 2/4 [00:05<00:05,  2.52s/it]

************* kitchen *************
              precision    recall  f1-score   support

    negative       0.86      0.83      0.85      2991
    positive       0.84      0.86      0.85      2954

    accuracy                           0.85      5945
   macro avg       0.85      0.85      0.85      5945
weighted avg       0.85      0.85      0.85      5945



 75%|███████▌  | 3/4 [00:08<00:02,  2.98s/it]

************* dvd *************
              precision    recall  f1-score   support

    negative       0.82      0.83      0.82      1779
    positive       0.83      0.83      0.83      1807

    accuracy                           0.83      3586
   macro avg       0.83      0.83      0.83      3586
weighted avg       0.83      0.83      0.83      3586



100%|██████████| 4/4 [00:12<00:00,  3.13s/it]

************* books *************
              precision    recall  f1-score   support

    negative       0.81      0.83      0.82      2201
    positive       0.83      0.82      0.82      2264

    accuracy                           0.82      4465
   macro avg       0.82      0.82      0.82      4465
weighted avg       0.82      0.82      0.82      4465






## TF
### Logistic Regression

In [8]:
for cate in tqdm(train_data['folder'].unique()):
    
    cate_train_data = train_data[train_data['folder']==cate]
    cate_test_data = test_data[test_data['folder']==cate]
    
    cnt_pipeline = build_preprocess_pipeline('count').fit(cate_train_data['text'])
    X_train_cnt_transformed = cnt_pipeline.transform(cate_train_data['text'])
    
    logistic_estimator = LogisticRegression(n_jobs=-1, random_state=42, 
                                            class_weight=None, solver='saga',
                                            max_iter=1000, penalty='l2',
                                            tol=1e-2, C=1
                                            )

    cate_lr = logistic_estimator.fit(X_train_cnt_transformed, cate_train_data['label'])
    
    ## Test the model
    
    X_test_transformed_cnt = cnt_pipeline.transform(cate_test_data['text'])
    y_pred = cate_lr.predict(X_test_transformed_cnt)
    
    print(f'************* {cate} *************')
    print(classification_report(cate_test_data['label'], y_pred))

 25%|██▌       | 1/4 [00:02<00:08,  2.79s/it]

************* electronics *************
              precision    recall  f1-score   support

    negative       0.86      0.85      0.86      2824
    positive       0.86      0.87      0.86      2857

    accuracy                           0.86      5681
   macro avg       0.86      0.86      0.86      5681
weighted avg       0.86      0.86      0.86      5681



 50%|█████     | 2/4 [00:05<00:05,  2.59s/it]

************* kitchen *************
              precision    recall  f1-score   support

    negative       0.86      0.85      0.86      2991
    positive       0.85      0.86      0.86      2954

    accuracy                           0.86      5945
   macro avg       0.86      0.86      0.86      5945
weighted avg       0.86      0.86      0.86      5945



 75%|███████▌  | 3/4 [00:08<00:03,  3.06s/it]

************* dvd *************
              precision    recall  f1-score   support

    negative       0.84      0.77      0.80      1779
    positive       0.79      0.85      0.82      1807

    accuracy                           0.81      3586
   macro avg       0.81      0.81      0.81      3586
weighted avg       0.81      0.81      0.81      3586



100%|██████████| 4/4 [00:13<00:00,  3.32s/it]

************* books *************
              precision    recall  f1-score   support

    negative       0.84      0.81      0.82      2201
    positive       0.82      0.84      0.83      2264

    accuracy                           0.83      4465
   macro avg       0.83      0.83      0.83      4465
weighted avg       0.83      0.83      0.83      4465






### Naive Bayes

In [9]:
for cate in tqdm(train_data['folder'].unique()):
    
    cate_train_data = train_data[train_data['folder']==cate]
    cate_test_data = test_data[test_data['folder']==cate]
    
    cnt_pipeline = build_preprocess_pipeline('count').fit(cate_train_data['text'])
    X_train_cnt_transformed = cnt_pipeline.transform(cate_train_data['text'])
    
    nb_estimator = MultinomialNB(alpha=1.0)

    cate_nb = nb_estimator.fit(X_train_tfidf_transformed, cate_train_data['label'])
    
    ## Test the model
    
    X_test_transformed_cnt = cnt_pipeline.transform(cate_test_data['text'])
    y_pred = cate_nb.predict(X_test_transformed_cnt)
    
    print(f'************* {cate} *************')
    print(classification_report(cate_test_data['label'], y_pred))

 25%|██▌       | 1/4 [00:02<00:08,  2.97s/it]

************* electronics *************
              precision    recall  f1-score   support

    negative       0.50      0.49      0.49      2824
    positive       0.50      0.52      0.51      2857

    accuracy                           0.50      5681
   macro avg       0.50      0.50      0.50      5681
weighted avg       0.50      0.50      0.50      5681



 50%|█████     | 2/4 [00:05<00:05,  2.73s/it]

************* kitchen *************
              precision    recall  f1-score   support

    negative       0.49      0.48      0.49      2991
    positive       0.49      0.51      0.50      2954

    accuracy                           0.49      5945
   macro avg       0.49      0.49      0.49      5945
weighted avg       0.49      0.49      0.49      5945



 75%|███████▌  | 3/4 [00:09<00:03,  3.15s/it]

************* dvd *************
              precision    recall  f1-score   support

    negative       0.51      0.45      0.48      1779
    positive       0.51      0.58      0.54      1807

    accuracy                           0.51      3586
   macro avg       0.51      0.51      0.51      3586
weighted avg       0.51      0.51      0.51      3586



100%|██████████| 4/4 [00:13<00:00,  3.29s/it]

************* books *************
              precision    recall  f1-score   support

    negative       0.82      0.85      0.83      2201
    positive       0.84      0.82      0.83      2264

    accuracy                           0.83      4465
   macro avg       0.83      0.83      0.83      4465
weighted avg       0.83      0.83      0.83      4465






## Lexicons

# Classifier for all categories

### Preprocessing

In [10]:
tfidf_pipeline = build_preprocess_pipeline('tfidf').fit(train_data['text'])
X_train_tfidf_transformed = tfidf_pipeline.transform(train_data['text'])

cnt_pipeline = build_preprocess_pipeline('count').fit(train_data['text'])
X_train_cnt_transformed = cnt_pipeline.transform(train_data['text'])

## TF - IDF
### Logistic Regression

In [11]:
logistic_estimator = LogisticRegression(n_jobs=-1, random_state=42, 
                                        class_weight=None, solver='saga',
                                        max_iter=1000, penalty='l2',
                                        tol=1e-2, C=1
                                        )

cate_lr = logistic_estimator.fit(X_train_tfidf_transformed, train_data['label'])

## Test the model
X_test_transformed_tfidf = tfidf_pipeline.transform(test_data['text'])
y_pred = cate_lr.predict(X_test_transformed_tfidf)

print(classification_report(test_data['label'], y_pred))

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9795
    positive       0.85      0.86      0.85      9882

    accuracy                           0.85     19677
   macro avg       0.85      0.85      0.85     19677
weighted avg       0.85      0.85      0.85     19677



### Naive Bayes

In [12]:
nb_estimator = MultinomialNB(alpha=1.0)

cate_nb = nb_estimator.fit(X_train_tfidf_transformed, train_data['label'])

## Test the model
y_pred = cate_lr.predict(X_test_transformed_tfidf)

print(classification_report(test_data['label'], y_pred))

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9795
    positive       0.85      0.86      0.85      9882

    accuracy                           0.85     19677
   macro avg       0.85      0.85      0.85     19677
weighted avg       0.85      0.85      0.85     19677



## TF
### Logistic Regression

In [13]:
logistic_estimator = LogisticRegression(n_jobs=-1, random_state=42, 
                                        class_weight=None, solver='saga',
                                        max_iter=1000, penalty='l2',
                                        tol=1e-2, C=1
                                        )

cate_lr = logistic_estimator.fit(X_train_cnt_transformed, train_data['label'])

## Test the model
X_test_transformed_cnt = cnt_pipeline.transform(test_data['text'])
y_pred = cate_lr.predict(X_test_transformed_cnt)

print(classification_report(test_data['label'], y_pred))

              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      9795
    positive       0.84      0.86      0.85      9882

    accuracy                           0.85     19677
   macro avg       0.85      0.85      0.85     19677
weighted avg       0.85      0.85      0.85     19677



### Naive Bayes

In [14]:
logistic_estimator = MultinomialNB(alpha=1.0)

cate_lr = logistic_estimator.fit(X_train_cnt_transformed, train_data['label'])

## Test the model
y_pred = cate_lr.predict(X_test_transformed_cnt)

print(classification_report(test_data['label'], y_pred))

              precision    recall  f1-score   support

    negative       0.82      0.83      0.82      9795
    positive       0.83      0.83      0.83      9882

    accuracy                           0.83     19677
   macro avg       0.83      0.83      0.83     19677
weighted avg       0.83      0.83      0.83     19677



## Lexicons