In [1]:
import pandas as pd
import numpy as np

import operator
import random

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import f1_score

from pathlib import Path

from tqdm import tqdm

In [2]:
training_path = Path().resolve().parents[0] / 'data' / 'clean_training_data.csv'

In [3]:
df = pd.read_csv(training_path)

In [4]:
df.columns

Index(['Unnamed: 0', 'original_text', 'label', 'text_length',
       'complexity_scores', 'token_count', 'stopword_count', 'sum_cs',
       'avg_cs'],
      dtype='object')

In [5]:
df = df.drop(['Unnamed: 0', 'original_text'], axis=1)
df = df.dropna()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['text_length', 'token_count', 'stopword_count', 
        'sum_cs', 'avg_cs']], df.label, test_size=0.2)

In [7]:
F1_score_df = pd.DataFrame(columns = ['model','text_length', 'token_count', 'stopword_count', 
        'sum_cs', 'avg_cs'])
models = ['LinearSVC','GaussianNB','MultinomialNB','RandomForestClassifier','AdaBoostClassifier']
F1_score_df['model'] = models

In [8]:
SVC = LinearSVC()
GNB = GaussianNB()
MNB = MultinomialNB()
RFC = RandomForestClassifier()
ABC = AdaBoostClassifier()

trainers = [SVC, GNB, MNB, RFC, ABC]
features = ['text_length','token_count','stopword_count','sum_cs','avg_cs']

index_count=0

for trainer in tqdm(trainers):
    for feature in features:
        trainer.fit(X_train[feature].values.reshape(-1,1), y_train)
        predictions = trainer.predict(X_test[feature].values.reshape(-1,1))
        F1_score_df.at[index_count,feature] = f1_score(y_test, predictions)
        #print(index_count, feature, f1_score(y_test, predictions))
    index_count+=1
    

index_count = 0    
for trainer in tqdm(trainers):
    trainer.fit(X_train, y_train)
    predictions = trainer.predict(X_test)
    F1_score_df.at[index_count,'all'] = f1_score(y_test, predictions)
    index_count+=1


100%|██████████| 5/5 [07:07<00:00, 85.53s/it] 
100%|██████████| 5/5 [01:44<00:00, 20.88s/it]


In [9]:
F1_score_df

Unnamed: 0,model,text_length,token_count,stopword_count,sum_cs,avg_cs,all
0,LinearSVC,0.345473,0.620011,0.550464,0.67341,0.667378,0.001009
1,GaussianNB,0.527426,0.514891,0.511717,0.491069,0.634302,0.54292
2,MultinomialNB,0.667378,0.667378,0.667378,0.667378,0.667378,0.593224
3,RandomForestClassifier,0.659501,0.619978,0.663055,0.619196,0.567369,0.615517
4,AdaBoostClassifier,0.659501,0.620011,0.663068,0.598456,0.634175,0.634751
