In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Read the pipe-delimited train and test tables; the first column of each
# file becomes the DataFrame index.
df_train, df_test = (
    pd.read_csv(csv_path, sep='|', index_col=0)
    for csv_path in ('data/8_sentences_train.csv', 'data/8_sentences_test.csv')
)

In [None]:
# Build a class-balanced training set: keep every positive row (label == 1)
# and draw the same number of negative rows (label == 0) at random.
# A fixed seed makes both the down-sampling and the shuffle reproducible, so
# a fresh "Restart & Run All" yields the same training set and accuracies.
RANDOM_SEED = 42

df_train_pos = df_train[df_train['label'] == 1]
df_train_neg = df_train[df_train['label'] == 0].sample(
    n=df_train_pos.shape[0], random_state=RANDOM_SEED
)

df_array = [df_train_pos, df_train_neg]
df_train_balanced = pd.concat(df_array)
# Shuffle so positives and negatives are interleaved.
df_train_balanced = df_train_balanced.sample(frac=1, random_state=RANDOM_SEED)
# Replace missing values by empty strings so the text columns can be
# concatenated and vectorized.
df_train_balanced = df_train_balanced.fillna("")

In [None]:
# Join the current sentence and its eight "prev" columns into one
# space-separated string per row; this is the corpus the vectorizer is fitted on.
context_columns = ['sentence'] + ['prev%d' % offset for offset in range(1, 9)]

all_sentences = df_train_balanced[context_columns[0]]
for column_name in context_columns[1:]:
    all_sentences = all_sentences + ' ' + df_train_balanced[column_name]

In [None]:
# Tf-idf over unigrams and bigrams with English stop words removed.
# The vocabulary and idf weights are learned once, on the combined training text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)
vectorizer.fit(all_sentences.values.astype('U'))

In [None]:
# The concatenated corpus was only needed to fit the vectorizer; drop the
# reference so the kernel can reclaim its memory.
del all_sentences

In [None]:
# Vectorize each training text column separately with the shared fitted
# vectorizer. list_sentence[0] holds the current sentence and list_sentence[k]
# holds the 'prev<k>' column.
list_sentence = [
    vectorizer.transform(df_train_balanced[text_column].values.astype('U'))
    for text_column in ['sentence'] + ['prev%d' % offset for offset in range(1, 9)]
]

# Keep the individual per-column names bound as well.
(X_sentence, X_prev1, X_prev2, X_prev3, X_prev4,
 X_prev5, X_prev6, X_prev7, X_prev8) = list_sentence

In [None]:
# Vectorize each test text column with the vectorizer fitted on the training
# text. list_sentence_test[0] holds the current sentence and
# list_sentence_test[k] holds the 'prev<k>' column.
#
# Missing values are replaced by "" first, matching the fillna("") applied to
# the training frame; without it, astype('U') would turn NaN into the literal
# text 'nan'. df_test itself is not modified.
test_text_columns = ['sentence'] + ['prev%d' % offset for offset in range(1, 9)]

list_sentence_test = [
    vectorizer.transform(df_test[text_column].fillna("").values.astype('U'))
    for text_column in test_text_columns
]

# Keep the individual per-column names bound as well.
(X_sentence_test, X_prev1_test, X_prev2_test, X_prev3_test, X_prev4_test,
 X_prev5_test, X_prev6_test, X_prev7_test, X_prev8_test) = list_sentence_test

In [None]:
# Train one logistic regression per context size and record its test accuracy.
# accuracy_array[i] is the accuracy obtained when the features are the current
# sentence plus the first i "prev" columns (i = 0 .. 8).

# The labels do not depend on the context size, so read them once.
y_train = df_train_balanced['label']
y_test = df_test['label']

n_settings = len(list_sentence)
accuracy_array = np.zeros(n_settings)
for i in range(n_settings):
    # Slice up to and including index i: with [:i] the runs for i = 0 and
    # i = 1 used identical features and the last "prev" block was never used.
    X_train = hstack(list_sentence[:i + 1], format='csr')
    X_test = hstack(list_sentence_test[:i + 1], format='csr')

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_test_predict = lr.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_predict)
    print('i: ',i,'  accuracy: ',accuracy)
    accuracy_array[i] = accuracy

In [None]:
# Plot test accuracy against the context-size index used in the training loop.
# The title and x-label describe what is actually plotted (amount of
# previous-sentence context), not n-gram length or the C parameter.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(range(len(accuracy_array)), accuracy_array)
ax.set_title('Test accuracy of logistic regression as previous sentences are added to the tf-idf features')
ax.set_xlabel('context window index i (more previous sentences included as i grows)')
ax.set_ylabel('accuracy')
plt.show()