# Here is an example to show how three parts of the data should be used. Feel free to change the models to get better scores.

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
def load_data(split_name='train', columns=['text', 'stars']):
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        df = df.loc[:,columns]
        print("succeed!")
        return df
    except:
        print("Failed, then try to ")
        print(f"select all columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        return df

In [3]:
train_df = load_data('train')
valid_df = load_data('valid')
test_df = load_data('test')

select [text, stars] columns from the train split
succeed!
select [text, stars] columns from the valid split
succeed!
select [text, stars] columns from the test split
Failed, then try to 
select all columns from the test split


In [4]:
test_df

Unnamed: 0,business_id,cool,date,funny,review_id,text,useful,user_id
0,7YYrZ9LgjpKLTtF-huhJug,0,2018-04-04 21:21:45,0,b8-ELBwhmDKcmcM8icT86g,I took the UP Train to Union Station to catch ...,0,9Lglv-v8SRo_S-IyvFBmbw
1,gyNixTgp1yFX97soBZpZ7Q,1,2013-07-10 00:04:01,0,rBpAJhIen_V-zLoXZIcROg,We worked with Fitness with a Twist for part o...,1,zIl62G84XT2BwSIAjjjvYw
2,vNWfQrQCa_XijstJbylcDQ,1,2015-10-28 01:23:21,2,_pALaDG6se9OTkGGhyhnNA,"It's your typical, average, run-of-the-mill co...",1,WP7FsUsgNW24s7HH5xi7pg
3,wfxmuA7LbKZKVLV58EiWBw,0,2015-11-19 03:48:40,0,ru8fpA1Uk0tTFtO5hLM49g,We went to Outback today to celebrate my daugh...,0,yLSj54f2YgGQu-lhPIhMTQ
4,5jTmjxb1X34EfcY1gos4tw,0,2016-06-04 23:29:46,0,fRPgwuFoY6SriToXZyaOQA,We Went to see Nashville unplugged a country c...,1,73-u0a3G9Le4GWG7zLYWtg
...,...,...,...,...,...,...,...,...
1995,rKl9yHz4LmQzD70yXBaRlg,0,2016-04-24 02:43:01,0,oZxqo6rspUimmoqKl0_qdA,This was an ok Burger joint. This chain is big...,1,XxG5SZOPkihWeJe3r9XL8Q
1996,Hqs4YNST_ZHbshwyi4bnsQ,3,2017-12-30 08:01:06,1,C_oCpNq91uTtOYUs2cChdg,Came here on a whim as we were passing through...,3,acPFDB6xB5qtcwTCI2wLvg
1997,46hnat0aLao-qYWZkN9aBg,1,2010-03-28 15:40:22,1,qMrxDB5fm8wIzexUp643xQ,This is my favorite Mexican food place on the ...,1,TJoILHOxI_K5kJUye0BtbA
1998,BGGQOJQTQerEQu0kHbT_UQ,0,2016-05-02 22:24:13,0,HrejfPp6Xduy5Pv7i-is1A,Stopped in here on 3/24/16 around 8pm when the...,0,PTHCHcBhJbyNS3hMSQ2pYQ


In [5]:
# prepare the data
x_train = train_df['text']
y_train = train_df['stars']
x_valid = valid_df['text']
y_valid = valid_df['stars']
x_test = test_df['text']

# You can use the valid data to choose the hyperparameter.
# In this case, you can decide which value of C (1 or 100) is better by evaluating on the valid data.

In [6]:
# build the first linear model with TFIDF feature
tfidf = TfidfVectorizer()
lr1 = LogisticRegression(C=100)
steps = [('tfidf', tfidf),('lr', lr1)]
pipe1 = Pipeline(steps)

In [7]:
# train the first model
pipe1.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('lr', LogisticRegression(C=100))])

In [8]:
# do the validation of your validation set on the hyper-parameter
y_pred = pipe1.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.74      0.77      0.76       517
           2       0.37      0.28      0.32       278
           3       0.39      0.42      0.41       344
           4       0.45      0.49      0.47       427
           5       0.66      0.63      0.64       434

    accuracy                           0.55      2000
   macro avg       0.52      0.52      0.52      2000
weighted avg       0.55      0.55      0.55      2000




[[398  66  39   7   7]
 [ 87  79  76  24  12]
 [ 32  56 145  94  17]
 [ 12   9  91 209 106]
 [  6   6  18 131 273]]
accuracy 0.552


In [9]:
# build the second linear model with TFIDF feature
tfidf = TfidfVectorizer()
lr2 = LogisticRegression(C=1)
steps = [('tfidf', tfidf),('lr', lr2)]
pipe2 = Pipeline(steps)

In [10]:
# train the second model
pipe2.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(C=1))])

In [11]:
# do the validation of your validation set on the hyper-parameter
y_pred = pipe2.predict(x_valid)
print(classification_report(y_valid, y_pred))
print("\n\n")
print(confusion_matrix(y_valid, y_pred))
print('accuracy', np.mean(y_valid == y_pred))

              precision    recall  f1-score   support

           1       0.70      0.88      0.78       517
           2       0.41      0.19      0.26       278
           3       0.48      0.45      0.46       344
           4       0.50      0.53      0.52       427
           5       0.68      0.70      0.69       434

    accuracy                           0.60      2000
   macro avg       0.55      0.55      0.54      2000
weighted avg       0.58      0.60      0.58      2000




[[453  27  19  13   5]
 [114  54  79  21  10]
 [ 45  37 155  93  14]
 [ 17  11  61 226 112]
 [ 19   2  11  97 305]]
accuracy 0.5965


# We find the second model (pipe2) is better, then we use the second model to make predictions on test data

In [12]:
predict_test = pipe2.predict(x_test)

In [13]:
predict_test

array([1, 5, 3, ..., 5, 5, 4])

In [14]:
# save your predictions
pred_df = pd.DataFrame({'stars': predict_test, 'review_id': test_df['review_id']})
pred_df.to_csv('pred.csv', index=False)

# Then you can submit your predictions `pred.csv` on test set. TAs will evaluate your predictions