In [63]:
import pandas as pd
df = pd.read_csv("../final_proj_dataset.csv")
text = df['speech']
target_party = df['political_party']
target_pres = df['president']
target_period = df['time_period']

In [64]:
import json
keyVals = json.load(open('../presidentKeys.json'))
target_pres = target_pres.apply(lambda x: list(keyVals.keys())[list(keyVals.values()).index(x)])
target_pres

0       2
1       2
2       2
3       2
4       2
       ..
992    28
993    28
994    28
995    28
996    28
Name: president, Length: 997, dtype: object

In [81]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

estimators = {
    "Gaussian Naive Bayes": GaussianNB(),
    "MultinomialNB": MultinomialNB(alpha = 0.5),
    "k-Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}
vectorizers = {
    "Count Vectorizer": CountVectorizer(),
    "TFIDF Vectorizer": TfidfVectorizer(),
    "TFIDF Vectorizer w/ min_df=2": TfidfVectorizer(min_df=2),
    "TFIDF Vectorizer w/ min_df=5": TfidfVectorizer(min_df=5),
    "TFIDF Vectorizer w/ min_df=2, StopWords": TfidfVectorizer(min_df=2, stop_words="english"),
    "TFIDF Vectorizer w/ min_df=5, StopWords": TfidfVectorizer(min_df=5, stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=2": TfidfVectorizer(min_df=2, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=2": TfidfVectorizer(min_df=2, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=5": TfidfVectorizer(min_df=5, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=5": TfidfVectorizer(min_df=5, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=2, StopWords": TfidfVectorizer(min_df=2, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=2, StopWords": TfidfVectorizer(min_df=2, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=5, StopWords": TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=5, StopWords": TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words="english"),
}
all_targets = {
    'Political Party':target_party, 
    'President':target_pres, 
    'Time Period':target_period
}


In [82]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

for target_name, target in all_targets.items():
    print(f"{target_name} models:")
    
    for vect_name, vectorizer in vectorizers.items():
        print(f"\tUsing {vect_name} as vect:")
        
        for estimator_name, estimator in estimators.items():
            X_train, X_test, y_train, y_test = train_test_split(text, target)
            
            vect = vectorizer.fit(X_train)
            
            X_train_vect = vect.transform(X_train).toarray()
            X_test_vect = vect.transform(X_test).toarray()
            
            estimator.fit(X=X_train_vect, y=y_train)
            
            print(f"\t\t{estimator_name}:")
            print(f"\t\t\tTrain acc: {estimator.score(X_train_vect, y_train)}")
            print(f"\t\t\tTest  acc: {estimator.score(X_test_vect, y_test)}")
            
# X_train, X_test, y_train, y_test = train_test_split(text, target_party)

# vect = CountVectorizer().fit(X_train)
# #vect = TfidfVectorizer().fit(X_train)

# X_train_vectorized = vect.transform(X_train)
# X_test_vectorized = vect.transform(X_test)

# model = MultinomialNB(alpha = 0.5).fit(X=X_train_vectorized, y=y_train)
# print(model.score(X_train_vectorized, y_train))
# print(model.score(X_test_vectorized, y_test))

Political Party models:
	Using Count Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 0.9839357429718876
			Test  acc: 0.592
		MultinomialNB:
			Train acc: 0.8714859437751004
			Test  acc: 0.72
		k-Nearest Neighbor:
			Train acc: 0.6198125836680054
			Test  acc: 0.428
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.5
	Using TFIDF Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.588
		MultinomialNB:
			Train acc: 0.39759036144578314
			Test  acc: 0.288
		k-Nearest Neighbor:
			Train acc: 0.6224899598393574
			Test  acc: 0.516
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.444
	Using TFIDF Vectorizer w/ min_df=2 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.584
		MultinomialNB:
			Train acc: 0.4149933065595716
			Test  acc: 0.392
		k-Nearest Neighbor:
			Train acc: 0.6345381526104418
			Test  acc: 0.5
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.444
	Using TFIDF Vectorizer w/ min_df=5 as vect:
		Gaussian Naive Bayes:

			Test  acc: 0.576
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.34
	Using TFIDF Vectorizer w/ ngrams=(1,2), min_df=5, StopWords as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.484
		MultinomialNB:
			Train acc: 0.5194109772423026
			Test  acc: 0.204
		k-Nearest Neighbor:
			Train acc: 0.7309236947791165
			Test  acc: 0.628
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.348
	Using TFIDF Vectorizer w/ ngrams=(1,3), min_df=5, StopWords as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.528
		MultinomialNB:
			Train acc: 0.6104417670682731
			Test  acc: 0.336
		k-Nearest Neighbor:
			Train acc: 0.7255689424364123
			Test  acc: 0.596
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.372
Time Period models:
	Using Count Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.712
		MultinomialNB:
			Train acc: 0.9558232931726908
			Test  acc: 0.764
		k-Nearest Neighbor:
			Train acc: 0.5943775100401606
			Test  acc: 0.432
		