In [13]:
import pandas as pd
df = pd.read_csv("../final_proj_dataset.csv")
text = df['speech']
target_party = df['political_party']
target_pres = df['president']
target_period = df['time_period']

In [14]:
import json
keyVals = json.load(open('../presidentKeys.json'))
target_pres = target_pres.apply(lambda x: list(keyVals.keys())[list(keyVals.values()).index(x)])
target_pres

0       2
1       2
2       2
3       2
4       2
       ..
992    28
993    28
994    28
995    28
996    28
Name: president, Length: 997, dtype: object

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

estimators = {
    "Gaussian Naive Bayes": GaussianNB(),
    "MultinomialNB": MultinomialNB(alpha = 0.5),
    "k-Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression()
}
vectorizers = {
    "Count Vectorizer": CountVectorizer(),
    "TFIDF Vectorizer": TfidfVectorizer(),
    "TFIDF Vectorizer w/ min_df=2": TfidfVectorizer(min_df=2),
    "TFIDF Vectorizer w/ min_df=5": TfidfVectorizer(min_df=5),
    "TFIDF Vectorizer w/ min_df=2, StopWords": TfidfVectorizer(min_df=2, stop_words="english"),
    "TFIDF Vectorizer w/ min_df=5, StopWords": TfidfVectorizer(min_df=5, stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=2": TfidfVectorizer(min_df=2, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=2": TfidfVectorizer(min_df=2, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=5": TfidfVectorizer(min_df=5, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=5": TfidfVectorizer(min_df=5, ngram_range=(1,2)),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=2, StopWords": TfidfVectorizer(min_df=2, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=2, StopWords": TfidfVectorizer(min_df=2, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,2), min_df=5, StopWords": TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words="english"),
    "TFIDF Vectorizer w/ ngrams=(1,3), min_df=5, StopWords": TfidfVectorizer(min_df=5, ngram_range=(1,2), stop_words="english"),
}
all_targets = {
    'Political Party':target_party, 
    'President':target_pres, 
    'Time Period':target_period
}


In [4]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

for target_name, target in all_targets.items():
    print(f"{target_name} models:")
    
    for vect_name, vectorizer in vectorizers.items():
        print(f"\tUsing {vect_name} as vect:")
        
        for estimator_name, estimator in estimators.items():
            X_train, X_test, y_train, y_test = train_test_split(text, target)
            
            vect = vectorizer.fit(X_train)
            
            X_train_vect = vect.transform(X_train).toarray()
            X_test_vect = vect.transform(X_test).toarray()
            
            estimator.fit(X=X_train_vect, y=y_train)
            
            print(f"\t\t{estimator_name}:")
            print(f"\t\t\tTrain acc: {estimator.score(X_train_vect, y_train)}")
            print(f"\t\t\tTest  acc: {estimator.score(X_test_vect, y_test)}")
            
# X_train, X_test, y_train, y_test = train_test_split(text, target_party)

# vect = CountVectorizer().fit(X_train)
# #vect = TfidfVectorizer().fit(X_train)

# X_train_vectorized = vect.transform(X_train)
# X_test_vectorized = vect.transform(X_test)

# model = MultinomialNB(alpha = 0.5).fit(X=X_train_vectorized, y=y_train)
# print(model.score(X_train_vectorized, y_train))
# print(model.score(X_test_vectorized, y_test))

Political Party models:
	Using Count Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 0.9852744310575636
			Test  acc: 0.624
		MultinomialNB:
			Train acc: 0.8808567603748326
			Test  acc: 0.68
		k-Nearest Neighbor:
			Train acc: 0.6305220883534136
			Test  acc: 0.472
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.504


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


		Logistic Regression:
			Train acc: 0.9892904953145917
			Test  acc: 0.716
	Using TFIDF Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.644
		MultinomialNB:
			Train acc: 0.3453815261044177
			Test  acc: 0.32
		k-Nearest Neighbor:
			Train acc: 0.6546184738955824
			Test  acc: 0.584
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.46
		Logistic Regression:
			Train acc: 0.6720214190093708
			Test  acc: 0.52
	Using TFIDF Vectorizer w/ min_df=2 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.56
		MultinomialNB:
			Train acc: 0.4685408299866131
			Test  acc: 0.352
		k-Nearest Neighbor:
			Train acc: 0.6666666666666666
			Test  acc: 0.532
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.484
		Logistic Regression:
			Train acc: 0.678714859437751
			Test  acc: 0.54
	Using TFIDF Vectorizer w/ min_df=5 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.524
		MultinomialNB:
			Train acc: 0.4390896921017403
			Test  acc: 0.376

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


		Logistic Regression:
			Train acc: 0.9879518072289156
			Test  acc: 0.644
	Using TFIDF Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.488
		MultinomialNB:
			Train acc: 0.09504685408299866
			Test  acc: 0.072
		k-Nearest Neighbor:
			Train acc: 0.6224899598393574
			Test  acc: 0.38
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.348
		Logistic Regression:
			Train acc: 0.5515394912985274
			Test  acc: 0.4
	Using TFIDF Vectorizer w/ min_df=2 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.456
		MultinomialNB:
			Train acc: 0.13119143239625167
			Test  acc: 0.088
		k-Nearest Neighbor:
			Train acc: 0.5876840696117804
			Test  acc: 0.376
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.312
		Logistic Regression:
			Train acc: 0.5127175368139224
			Test  acc: 0.356
	Using TFIDF Vectorizer w/ min_df=5 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.42
		MultinomialNB:
			Train acc: 0.17670682730923695
			Test  acc: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


		Logistic Regression:
			Train acc: 0.9933065595716198
			Test  acc: 0.708
	Using TFIDF Vectorizer as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.672
		MultinomialNB:
			Train acc: 0.19812583668005354
			Test  acc: 0.168
		k-Nearest Neighbor:
			Train acc: 0.6452476572958501
			Test  acc: 0.552
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.48
		Logistic Regression:
			Train acc: 0.6934404283801874
			Test  acc: 0.604
	Using TFIDF Vectorizer w/ min_df=2 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.604
		MultinomialNB:
			Train acc: 0.24230254350736277
			Test  acc: 0.196
		k-Nearest Neighbor:
			Train acc: 0.6760374832663989
			Test  acc: 0.54
		Decision Tree:
			Train acc: 1.0
			Test  acc: 0.468
		Logistic Regression:
			Train acc: 0.7215528781793842
			Test  acc: 0.532
	Using TFIDF Vectorizer w/ min_df=5 as vect:
		Gaussian Naive Bayes:
			Train acc: 1.0
			Test  acc: 0.576
		MultinomialNB:
			Train acc: 0.2101740294511379
			Test  acc:

In [7]:
# Doing TfidfVectorizer things:
from sklearn.model_selection import GridSearchCV

min_df_ops = [2,3,4,5]
stop_words_ops = [None, "english"]
ngram_range_ops = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
#params = {"min_df":[2,3,4,5], "stop_words":[None, "enlgish"], "ngram_range":[(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]}


for target_name, target in all_targets.items():
    X_train, X_test, y_train, y_test = train_test_split(text, target)
    if target_name == "Time Period":
        continue
    print(f"\n{target_name}")
    for m_df in min_df_ops:
        for stops in stop_words_ops:
            for ngrams in ngram_range_ops:
                vect = TfidfVectorizer(min_df=m_df, ngram_range=ngrams, stop_words=stops)
                vect = vect.fit(X_train)

                estimator = KNeighborsClassifier()

                X_train_vect = vect.transform(X_train).toarray()
                X_test_vect = vect.transform(X_test).toarray()

                estimator.fit(X=X_train_vect, y=y_train)
                print(f"df={df}, stops={stops}, ngram_range={ngrams}")
                print(f"train={estimator.score(X_train_vect, y_train)}")
                print(f"testt={estimator.score(X_test_vect, y_test)}")


Political Party
df=2, stops=None, ngram_range=(1, 1)
train=0.6398929049531459
testt=0.532
df=2, stops=None, ngram_range=(1, 2)
train=0.4886211512717537
testt=0.404
df=2, stops=None, ngram_range=(2, 2)
train=0.6211512717536813
testt=0.488
df=2, stops=None, ngram_range=(1, 3)
train=0.4149933065595716
testt=0.348
df=2, stops=None, ngram_range=(2, 3)
train=0.6506024096385542
testt=0.504
df=2, stops=None, ngram_range=(3, 3)
train=0.8340026773761714
testt=0.748
df=2, stops=english, ngram_range=(1, 1)
train=0.7657295850066934
testt=0.692
df=2, stops=english, ngram_range=(1, 2)
train=0.7617135207496654
testt=0.716
df=2, stops=english, ngram_range=(2, 2)
train=0.8393574297188755
testt=0.764
df=2, stops=english, ngram_range=(1, 3)
train=0.7643908969210174
testt=0.72
df=2, stops=english, ngram_range=(2, 3)
train=0.8487282463186078
testt=0.72
df=2, stops=english, ngram_range=(3, 3)
train=0.821954484605087
testt=0.648
df=3, stops=None, ngram_range=(1, 1)
train=0.6492637215528781
testt=0.54
df=3, s

In [6]:
# Doing TfidfVectorizer things:
from sklearn.model_selection import GridSearchCV

min_df_ops = [2,3,4,5]
stop_words_ops = [None, "english"]
ngram_range_ops = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
#params = {"min_df":[2,3,4,5], "stop_words":[None, "enlgish"], "ngram_range":[(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]}

estimator = MLPClassifier(solver="lbfgs", hidden_layer_sizes=(100, 50))

X_train, X_test, y_train, y_test = train_test_split(text, target_period)
    
vect = TfidfVectorizer(min_df=2, ngram_range=(1,2), stop_words="english").fit(X_train)

X_train_vect = vect.transform(X_train).toarray()
X_test_vect = vect.transform(X_test).toarray()


# estimator.fit(X=X_train_vect, y=y_train)

# print(f"\t\t{estimator_name}:")
# print(f"\t\t\tTrain acc: {estimator.score(X_train_vect, y_train)}")
# print(f"\t\t\tTest  acc: {estimator.score(X_test_vect, y_test)}")