# Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper

# Load data

In [2]:
# Read the merged races/text table and discard the leftover index column that
# was written out as 'Unnamed: 0' when the CSV was saved.
df = pd.read_csv("races_text_merged.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,Race,Date,WPM,Accuracy,Text_Id,Date_Epoch,Date_Month,Date_Year,Date_Day,Outcome_Rank,Outcome_No_Racers,Length,Text,Difficulty Rating,Average,Punctuation_Count,Capital_Case_Count
0,4126,2018-12-22 00:01:33,78.76,0.96,3810450,1545454893,December,2018,22,3,3,317,"Look, Ross, you gotta understand, between us w...",1.057,98.03,12,6
1,4125,2018-12-22 00:00:25,90.25,0.98,3641234,1545454825,December,2018,22,2,5,424,Harry could hardly believe this was real. Four...,0.986,93.64,10,7
2,4124,2018-09-27 01:55:37,100.13,0.98,499,1538031337,September,2018,27,1,2,297,Ideology: the process of making ideas. Certain...,0.95,61.94,5,3
3,1195,2016-02-18 21:15:18,65.89,0.91,499,1455848118,February,2016,18,2,3,297,Ideology: the process of making ideas. Certain...,0.95,61.94,5,3
4,1039,2016-01-27 08:40:02,68.13,0.95,499,1453902002,January,2016,27,2,3,297,Ideology: the process of making ideas. Certain...,0.95,61.94,5,3


# Define classification objective and prepare classes accordingly

In [21]:
# Size of the 75-85 WPM band; both bounds are exclusive, so races at exactly
# 75 or 85 WPM are not counted here.
df[df['WPM'].gt(75) & df['WPM'].lt(85)].shape

(1168, 17)

In [22]:
# Size of the 85-95 WPM band; both bounds are exclusive, so races at exactly
# 85 or 95 WPM are not counted here.
df[df['WPM'].gt(85) & df['WPM'].lt(95)].shape

(959, 17)

In [23]:
# Number of races strictly faster than 95 WPM.
df[df['WPM'].gt(95)].shape

(446, 17)

In [24]:
# Label every race faster than 75 WPM with a speed class:
#   0 -> 75 < WPM <= 85
#   1 -> 85 < WPM <= 95
#   2 -> WPM > 95
# Take an explicit copy so the new column is written to an independent frame
# instead of a slice of `df` (the cause of pandas' SettingWithCopyWarning).
df_75 = df[df['WPM'] > 75].copy()

# Derive the labels from df_75 itself so they always line up with its rows.
# The earlier row loop walked all of `df` and emitted a label for WPM >= 75,
# so a race at exactly 75.0 would have produced one label more than df_75 has
# rows and made the column assignment fail.
# np.select uses the first matching condition, which keeps the original
# boundary handling: exactly 85.0 -> class 0, exactly 95.0 -> class 1.
wpm = df_75['WPM']
wpm_class = np.select([wpm <= 85.0, wpm <= 95.0], [0, 1], default=2)
df_75['wpm_class'] = wpm_class

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [25]:
# Spot-check the assigned classes against the WPM values they were derived from.
df_75[['WPM', 'wpm_class']].head()

Unnamed: 0,WPM,wpm_class
0,78.76,0
1,90.25,1
2,100.13,2
6,85.26,1
7,95.85,2


In [26]:
# Class balance: number of races per speed class.
df_75['wpm_class'].value_counts()

0    1171
1     960
2     446
Name: wpm_class, dtype: int64

# Train Test Split

In [27]:
# Hold out 20% of the labelled races, with the passage text as the only
# feature and the speed class as the target.
text_features = df_75['Text']
speed_labels = df_75['wpm_class']
X_train, X_test, y_train, y_test = train_test_split(
    text_features, speed_labels, test_size=0.2, random_state=42
)

# Create an ML pipeline

In [58]:
# Draw a reproducible random 75% of the labelled races as the training frame.
df_train = df_75.sample(frac = 0.75, random_state=42)

In [59]:
# Everything that was not sampled into the training frame becomes the test frame.
in_train = df_75.index.isin(df_train.index)
df_test = df_75[~in_train]

In [60]:
# Both vectorizers share one configuration: case-sensitive word n-grams of
# length 2 to 5. vec0 yields raw counts, vec1 yields TF-IDF weights.
ngram_options = dict(lowercase=False, analyzer='word', ngram_range=(2, 5))
vec0 = CountVectorizer(**ngram_options)
vec1 = TfidfVectorizer(**ngram_options)

In [61]:
# Feature spec: the passage text goes through the TF-IDF vectorizer, while the
# numeric columns are passed through untransformed (transformer None).
passthrough_columns = [
    'Punctuation_Count',
    'Capital_Case_Count',
    'Length',
    'Difficulty Rating',
]
feature_spec = [('Text', vec1)] + [(column, None) for column in passthrough_columns]
mapper = DataFrameMapper(feature_spec, sparse=True)

In [62]:
# Two-stage model: DataFrame -> feature matrix via the mapper, then a
# logistic-regression classifier with a fixed seed.
classifier = LogisticRegression(random_state=0)
pipeline = Pipeline(steps=[('featurizer', mapper), ('clf', classifier)])

In [63]:
# Fit featurizer and classifier on the training rows. The whole frame is passed
# as X; the mapper step reads the columns it was configured with.
pipeline.fit(df_train, df_train['wpm_class'])

Pipeline(memory=None,
     steps=[('featurizer', DataFrameMapper(default=False, df_out=False,
        features=[('Text', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [64]:
# Score the fitted pipeline on the held-out rows.
pipeline.score(df_test, df_test['wpm_class'])

0.4984472049689441

In [65]:
# Per-class precision/recall/F1 on the held-out rows.
y_true = df_test['wpm_class']
y_pred = pipeline.predict(df_test)
report = classification_report(y_true=y_true, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.54      0.79      0.64       303
           1       0.39      0.28      0.33       234
           2       0.48      0.15      0.23       107

   micro avg       0.50      0.50      0.50       644
   macro avg       0.47      0.41      0.40       644
weighted avg       0.48      0.50      0.46       644



In [129]:
# Persist the fitted pipeline (featurizer + classifier) for reuse elsewhere.
# `pickle` is imported with the other dependencies in the notebook's import cell.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
with open("typeracer_model.pkl", "wb") as outfile:
    pickle.dump(pipeline, outfile)