In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import string
import ssl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
%matplotlib inline

In [2]:
# Work around SSL errors on Mac OS X by replacing the ssl module's default
# HTTPS context factory with the unverified one.
# NOTE(review): certificates are then no longer checked for anything in this
# kernel that relies on ssl's default HTTPS context -- confirm it is still needed.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    # Patch only when this Python build exposes the private factory.
    ssl._create_default_https_context = _create_unverified_https_context

In [17]:
# Read the comma-separated training file, which is parsed without a header
# row, and give its columns explicit names.
DATA_PATH = 'training_data/wav2letter.txt'
COLUMN_NAMES = ["sentence", "label"]

df = pd.read_csv(DATA_PATH, sep=",", header=None)
df.columns = COLUMN_NAMES

# Preview the first rows.
df.head()

Unnamed: 0,sentence,label
0,had lived nearly twenty one years in the world,0
1,most affectionate indulgent father and had in,1
2,her mother had died too long ago for her to,0
3,have more than an indistinct remembrance of he...,1
4,and her place had been supplied by an excellen...,0


In [22]:
from sklearn.utils import resample

# Balance the classes by oversampling the rows labelled 1 until there are as
# many of them as rows labelled 0.
# NOTE(review): this oversampling happens before the train/test split further
# down, so copies of the same minority row can land in both the training and
# the test set and inflate the reported scores. Consider splitting first and
# resampling only the training portion.

# Separate majority and minority classes (label 0 is treated as the majority)
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Upsample minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,                # sample with replacement
    n_samples=len(df_majority),  # to match majority class
    random_state=123,            # reproducible results
)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Independent copy that the feature-extraction cells work on.
data = df_upsampled.copy()

# Display new class counts. This must be the last expression of the cell:
# a bare expression followed by another statement is evaluated and discarded,
# so nothing would be shown.
df_upsampled.label.value_counts()

In [23]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_process(sentence):
    """Remove punctuation from ``sentence`` and split it into tokens.

    Every character found in ``string.punctuation`` is dropped (not replaced
    by a space), then the remaining text is split on whitespace. Case is left
    untouched.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The whitespace-separated tokens; an empty list when nothing but
        punctuation and whitespace is present.
    """
    return sentence.translate(_PUNCTUATION_TABLE).split()

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words representation, using text_process as the analyzer
# so each sentence is tokenized by it.
# fit_transform learns the vocabulary and produces the count matrix in one
# pass; calling fit() and then transform() on the same sentences would run
# the analyzer over every sentence twice for the same result.
bow_transformer = CountVectorizer(analyzer=text_process)
sentence_bow = bow_transformer.fit_transform(data["sentence"])

36516


In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit a TF-IDF transformer on the bag-of-words counts, then apply it to the
# same matrix to obtain the weighted features.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(sentence_bow)
sentence_tfidf = tfidf_transformer.transform(sentence_bow)

In [28]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
features_train, features_test, labels_train, labels_test = train_test_split(
    sentence_tfidf,
    data['label'],
    test_size=0.3,
    random_state=111,
)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [30]:
# Seed shared by every estimator below that takes a random_state.
RANDOM_STATE = 111

# Non-ensemble models
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=10)
mnb = MultinomialNB(alpha=0.2)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=RANDOM_STATE)

# Ensemble models
rfc = RandomForestClassifier(n_estimators=31, random_state=RANDOM_STATE)
abc = AdaBoostClassifier(n_estimators=31, random_state=RANDOM_STATE)
bc = BaggingClassifier(n_estimators=10, random_state=RANDOM_STATE)
etc = ExtraTreesClassifier(n_estimators=10, random_state=RANDOM_STATE)

In [31]:
# Display name -> estimator. Insertion order is the order in which the
# classifiers are visited when this dict is iterated.
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
}

In [32]:
def train_classifier(clf, feature_train, labels_train):
    """Fit ``clf`` in place on the given training data.

    Parameters
    ----------
    clf : object
        Any object exposing ``fit(features, labels)``.
    feature_train
        Training features, passed to ``clf.fit`` as the first argument.
    labels_train
        Training labels, passed to ``clf.fit`` as the second argument.

    Returns
    -------
    None
        Whatever ``clf.fit`` returns is not passed on.
    """
    clf.fit(feature_train, labels_train)
    return None
    
def predict_labels(clf, features):
    """Return the predictions of ``clf`` for ``features``.

    Parameters
    ----------
    clf : object
        Any object exposing ``predict(features)``.
    features
        Feature rows, passed to ``clf.predict`` unchanged.

    Returns
    -------
    The value returned by ``clf.predict(features)``, unmodified.
    """
    predictions = clf.predict(features)
    return predictions

In [None]:
# Fit each classifier on the training split, score it on the held-out split,
# and print a per-class report. pred_scores collects (name, [accuracy]) pairs.
pred_scores = []
for name, model in clfs.items():
    train_classifier(model, features_train, labels_train)
    pred = predict_labels(model, features_test)

    accuracy = accuracy_score(labels_test, pred)
    pred_scores.append((name, [accuracy]))

    print("Classifier: " + name)
    print(classification_report(labels_test, pred))