In [None]:
#Part 1: Downloading and preprocessing the twitter data for cities with the highest COVID 
#case counts

import csv
import json
import snscrape.modules.twitter as sntwitter
import datetime
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
import operator
from gensim.utils import simple_preprocess



# Two reference documents, one with records of cumulative COVID cases by county, 
#and county seats(municipalities) by county

covidhandle=open('time_series_covid19_confirmed_US.csv')
covidcsv=csv.reader(covidhandle)


seatshandle=open('CountySeats.csv')
seatscsv=csv.reader(seatshandle)

#Convert table of Counties and County Seats to a lookup table
#because COVID data is by county, but twitter users are more
#likely to list their location by city than county

seatslookup=dict()
for i in seatscsv:
    seatslookup[i[0]]=i[1]


#Function that will find the dates where COVID cases crossed certain
#thresholds in different counties. Cutoff is counties with at least
#100,000 total cases
#After analysis, only 3 time periods are used to try to make the periods easier to 
#differentiate

def linegrab(L):
    if float(L[-1])>120000:
        K1,K10,K30,K75,K100=0,0,0,0,0
        

        for i,ct in enumerate(L[11:]):
            if K1==0:
                if float(ct)>1000:
                    K1=i
            elif K10==0:
                if float(ct)>10000:
                    K10=i
            elif K30==0:
                if float(ct)>30000:
                    K30=i
            elif K75==0:
                if float(ct)>60000:
                    K75=i
            elif K100==0:
                if float(ct)>100000:
                    K100=i

        #return [K1,K10,K30,K75,K100]
        return [K1,K10,K30,K100]
    else:
        return []
        

        
        
#First 11 fields are location data, daily COVID reports start after
#First if clause takes dates, linegrab function finds the milestone
#dates if the county is over 100K, and records them in a dictionary
#that has the city (looked up from county seats) and milestone dates
        
spikes=[]

for i,line in enumerate(covidcsv):
    if i==0:
        cdates=line[11:]       
    else:
        milestone=linegrab(line)
        if len(milestone)>0:
            msdate=[]
            for m in milestone:
                msdate.append(cdates[m])
            spikes.append({'city':seatslookup[line[5]],'county':line[5],'state':line[6],'dates':msdate})

covidhandle.close()
seatshandle.close()

#Punctuation removal for lemmatization
def remove_punctuation(text):
    no_punct="".join([c for c in text if c not in string.punctuation])
    return no_punct

#Iterate through all the cities that met the threshold and search twitter on all of the 
#milestone dates nspike is the collection of most frequent words for each city, 
#for higher level summarization. ttwe is tokenized version of every tweet for more 
#involved natural language analysis


nspike=[]
ttwe=[]


for s in spikes:
    da=0
  
    for d in s['dates']:
        da+=1
        #Print when iterating to track progress, as it takes a while
        print('city',s['city'],'dates',d)
        
        #Format date from COVID file for twitter to accept and search twitter near that 
        #city and on that date
        city=s['city']
        state=s['state']
        county=s['county']
        date=datetime.datetime.strptime(d, '%m/%d/%Y').strftime('%Y-%m-%d')
        searchstring='''near:"{}" within:50mi until:{}'''.format(city,date)
        
        #Frequency to store word counts and success count sets the cutoff for how many 
        #tweets per date. Success is finding a tweet with a user who has tagged the city 
        #in question as their location and writes their tweet in English
        
        success=0
        frequency={}
                
        
        for i,tweet in enumerate(sntwitter.TwitterSearchScraper(searchstring).get_items()):
            if success>1000:
                break
            else:
                if tweet.lang =='en' and (tweet.user.location==city or tweet.user.location==state):
                    success+=1
                    
                    #Frequency derived by tokenizing, removing stopwords, and lemmatizing
                    
                    tokens = nltk.word_tokenize(tweet.content)
                    clean_tokens=[tok for tok in tokens if len(tok.lower())>1 and 
                                  (tok.lower() not in set(stopwords.words('english')))]

                    cleanTok=[]
                    for w in clean_tokens:
                        cleanT = remove_punctuation(w).lower()
                        if len(cleanT.strip())>1:
                            cleanTok.append(cleanT.strip())

                    lemmatizer = WordNetLemmatizer()
                    textLemma=[]
                    for tok in cleanTok:
                        textLemma.append(lemmatizer.lemmatize(tok))
                        if 'http' in textLemma: textLemma.remove('http')
                            
                    #In addition to lemmatizing for TF-IDF analysis, simple processing for 
                    #use with Word2Vec embeddings
                    
                    w2vtok=simple_preprocess(tweet.content, deacc=True)
                    
                    ttwe.append({'city':city,'state':state,'date':date,'threshold':da,
                                 'tokenized_text':w2vtok,'lemma':textLemma})
                    
                    for w in textLemma:
                        if frequency.get(w,False):
                            frequency[w]+=1
                        else:
                            frequency[w]=1
        
        #Set cutoff for number of words included in frequency
        sorted_freq_dist= sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)
        
        
        
        for sf in sorted_freq_dist[:50]:
            nspike.append({'city':city,'county':county,'state':state,'date':date,'threshold':da,'word':sf[0],
                           'frequency':sf[1]})
            
storage = open('FFF.json','w')
storagetwe = open('TTT.json','w')
json.dump(ttwe,storagetwe)
storagetwe.close()
json.dump(nspike,storage)
storage.close()        


    
 

In [14]:
#Part 2: NLP of collected twitter data to determine possible indicators for COVID in population, with 
#parallel analysis of twitter language by state to see if the amount of COVID cases is more or less of 
#a predictor of language than just the region where the language is from

import json
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import TfidfModel
from gensim import corpora
import gensim
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
try:
    import cPickle as pickle
except ModuleNotFoundError:
    import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

#Earlier version with 4 threshold levels instead of 3 was downloaded in 4 different files 
#to save time. Manually filter second threshold so only dates are the 1K COVID case mark, 
#30K case mark, and 100K case mark. Load to ctweets dataframe for NLP

#with open('TT1.json') as infile:
    df1 = pd.DataFrame(json.load(infile))
#with open('TT2.json') as infile:
#    df2 = pd.DataFrame(json.load(infile))
#with open('TT4.json') as infile:
#    df4 = pd.DataFrame(json.load(infile))
#with open('TT3.json') as infile:
#    df3 = pd.DataFrame(json.load(infile))
    
with open('TTT.json') as infile:
    ctweets1 = pd.DataFrame(json.load(infile))

#Conversion process to single data frame and saved to pickle
#Pickles are used for the dataset, the vectorization models and each of the estimator 
#classes because the code was running simultaneously in different notebooks training 
#different estimators to reduce processing time

ctweets1=pd.concat([df1,df2,df3,df4])
ctweets2=ctweets1[ctweets1["threshold"]!=2]
ctweets=ctweets2[ctweets2["tokenized_text"].str.len()>0]
#save_object(ctweets,"ctweetspickle.pkl")

#with open(r"ctweetspickle.pkl", "rb") as input_file:
#    ctweets = pickle.load(input_file)

#The data set for measuring language at each date is uniformly distributed across the 
#time periods for that city although the dates themselves vary. The states however need 
#to be balanced because states like California and Texas are overweighted, so the dataset 
#for measuring by state is created so all the states have the same number
#of records as the state with the smallest amount
    
    
g = ctweets.groupby(['state'])
g.size().reset_index(name='counts')
f=g.apply(lambda x: x.sample(g.size().min()).reset_index())

ctweetstate=f.droplevel(0).reset_index()



#To deteremine whether there is more predictive accuracy based on the location of the 
#tweet or the timeline of COVID cases, there will be analysis run for date targets 
#(threshold) and location targets (state)



X_train_threshold,X_test_threshold,
y_train_threshold,y_test_threshold=train_test_split(
    ctweets[["tokenized_text","lemma"]],ctweets["threshold"],test_size=0.2,random_state=15)

X_train_state,X_test_state,
y_train_state,y_test_state=train_test_split(ctweetstate[["tokenized_text","lemma"]],
                                            ctweetstate["state"],test_size=0.2,
                                            random_state=15)

#Record the likelihood of a random guess to compare to later predicted results
threshold_odds=1/ctweets["threshold"].nunique()
state_odds=1/ctweetstate["state"].nunique()


#Setup first for Word2Vec vectorization, and then bag of words vectorization
#Size for both vectorizations limited to 1000 columns for states and thresholds

threshtokens=pd.Series(ctweets["tokenized_text"])
threshmodel=Word2Vec(threshtokens,size=1000,window=6,min_count=1,workers=7,sg=1)
#save_object(threshmodel,"threshmodelpickle.pkl")

with open(r"threshmodelpickle.pkl", "rb") as input_file:
    threshmodel = pickle.load(input_file)

threshmydict=corpora.Dictionary(ctweets["lemma"],prune_at=1000)
threshcorpus=[threshmydict.doc2bow(line) for line in ctweets["lemma"]]
threshtfidf_model=TfidfModel(threshcorpus)
save_object(threshtfidf_model,"threshtfidf_modelpickle.pkl")


#Models for state

statetokens=pd.Series(ctweetstate["tokenized_text"])
statemodel=Word2Vec(statetokens,size=1000,window=6,min_count=1,workers=7,sg=1)
#save_object(statemodel,"statemodelpickle.pkl")

statemydict=corpora.Dictionary(ctweetstate["lemma"],prune_at=1000)
statecorpus=[statemydict.doc2bow(line) for line in ctweetstate["lemma"]]
statetfidf_model=TfidfModel(statecorpus)
#Save_object(statetfidf_model,"statetfidf_modelpickle.pkl")


#Write vectorizations of X_train lists and X_test lists to four CSV documents 
#(eight including the state dataset) for repeated access later. FIrst the TFIDF vectorization, 
#then Word2Vec for train then test etc. In all cases setting index0 as header variable 
#because index value has been randomized


vocab_len=len(threshmydict.token2id)
index0=True
tfid_tf='twittertrainXTFID.csv'
with open(tfid_tf,'w+',encoding='utf8') as tfidf_file:
    for index, row in X_train_threshold.iterrows():
        doc=mydict.doc2bow(row['lemma'])
        features = gensim.matutils.corpus2csc([threshtfidf_model[doc]],
                                              num_terms=vocab_len).toarray()[:,0]
        if index0:
            header=",".join(str(threshmydict[ele]) for ele in range(vocab_len))
            tfidf_file.write(header)
            tfidf_file.write("\n")
            index0=False
        line1=",".join([str(vector_element) for vector_element in features])
        tfidf_file.write(line1)
        tfidf_file.write("\n")

index0=True
word2vec_tf="twittertrainX.csv"
with open(word2vec_tf,'w+')as word2vec_file:
    for index, row in X_train_threshold.iterrows():

        model_vector=(np.mean([threshmodel[token] for token in row ['tokenized_text']],
                              axis=0)).tolist()
        if index0:
            index0=False
            header=",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        if type(model_vector) is list:
            line1=",".join([str(vector_element) for vector_element in model_vector])
        else:
            line1=",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')



index0=True
tfid_tf_test='twittertestXTFID.csv'
with open(tfid_tf_test,'w+',encoding='utf8') as tfidf_file:
    for index, row in X_test_threshold.iterrows():
        doc=threshmydict.doc2bow(row['lemma'])
        features = gensim.matutils.corpus2csc([threshtfidf_model[doc]],
                                              num_terms=vocab_len).toarray()[:,0]
        if index0:
            header=",".join(str(threshmydict[ele]) for ele in range(vocab_len))
            tfidf_file.write(header)
            tfidf_file.write("\n")
            index0=False
        line1=",".join([str(vector_element) for vector_element in features])
        tfidf_file.write(line1)
        tfidf_file.write("\n")
        
index0=True
word2vec_tf_test="twittertestX.csv"
with open(word2vec_tf_test,'w+')as word2vec_file:
    for index, row in X_test_threshold.iterrows():

        model_vector=(np.mean([threshmodel[token] for token in row ['tokenized_text']],
                              axis=0)).tolist()
        if index0:
            index0=False
            header=",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        if type(model_vector) is list:
            line1=",".join([str(vector_element) for vector_element in model_vector])
        else:
            line1=",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')



#Files for the state data set


vocab_len=len(statemydict.token2id)
index0=True
statetfid_tf='statetwittertrainXTFID.csv'
with open(statetfid_tf,'w+',encoding='utf8') as tfidf_file:
    for index, row in X_train_state.iterrows():
        doc=statemydict.doc2bow(row['lemma'])
        features = gensim.matutils.corpus2csc([statetfidf_model[doc]],
                                              num_terms=vocab_len).toarray()[:,0]
        if index0:
            header=",".join(str(statemydict[ele]) for ele in range(vocab_len))
            tfidf_file.write(header)
            tfidf_file.write("\n")
            index0=False
        line1=",".join([str(vector_element) for vector_element in features])
        tfidf_file.write(line1)
        tfidf_file.write("\n")

index0=True
stateword2vec_tf="statetwittertrainX.csv"
with open(stateword2vec_tf,'w+')as word2vec_file:
    for index, row in X_train_state.iterrows():

        model_vector=(np.mean([statemodel[token] for token in row ['tokenized_text']],
                              axis=0)).tolist()
        if index0:
            index0=False
            header=",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        if type(model_vector) is list:
            line1=",".join([str(vector_element) for vector_element in model_vector])
        else:
            line1=",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')



index0=True
statetfid_tf_test='statetwittertestXTFID.csv'
with open(statetfid_tf_test,'w+',encoding='utf8') as tfidf_file:
    for index, row in X_test_state.iterrows():
        doc=statemydict.doc2bow(row['lemma'])
        features = gensim.matutils.corpus2csc([statetfidf_model[doc]],
                                              num_terms=vocab_len).toarray()[:,0]
        if index0:
            header=",".join(str(statemydict[ele]) for ele in range(vocab_len))
            tfidf_file.write(header)
            tfidf_file.write("\n")
            index0=False
        line1=",".join([str(vector_element) for vector_element in features])
        tfidf_file.write(line1)
        tfidf_file.write("\n")
        
index0=True
stateword2vec_tf_test="statetwittertestX.csv"
with open(stateword2vec_tf_test,'w+')as word2vec_file:
    for index, row in X_test_state.iterrows():

        model_vector=(np.mean([statemodel[token] for token in row ['tokenized_text']],
                              axis=0)).tolist()
        if index0:
            index0=False
            header=",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        if type(model_vector) is list:
            line1=",".join([str(vector_element) for vector_element in model_vector])
        else:
            line1=",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')






#Files stored with vectorizations of tweets
#Now read into data frames, one each for Word2Vec method and bag of words method
        
word2vec_X=pd.read_csv(word2vec_tf)
stateword2vec_X=pd.read_csv(stateword2vec_tf)

tfid_X=pd.read_csv(tfid_tf)
statetfid_X=pd.read_csv(statetfid_tf)

test_features_X=pd.read_csv(word2vec_tf_test)
statetest_features_X=pd.read_csv(stateword2vec_tf_test)

test_features_tfid=pd.read_csv(tfid_tf_test) 
statetest_features_tfid=pd.read_csv(statetfid_tf_test) 

    
#Initialize and train three classifiers, each for predicting the threshold and state, 
#and using the Word2Vec and BOW vectorizations
    
forest_thresh=RandomForestClassifier()
forest_state=RandomForestClassifier()
forest_thresh_tfid=RandomForestClassifier()
forest_state_tfid=RandomForestClassifier()
    
dt_thresh=DecisionTreeClassifier()
dt_state=DecisionTreeClassifier()
dt_thresh_tfid=DecisionTreeClassifier()
dt_state_tfid=DecisionTreeClassifier()

sgd_thresh=SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
sgd_state=SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
sgd_thresh_tfid=SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
sgd_state_tfid=SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)

forest_thresh.fit(word2vec_X,y_train_threshold)
#save_object(forest_thresh,"forest_threshpickle.pkl")

forest_state.fit(stateword2vec_X,y_train_state)
#save_object(forest_state,"forest_statepickle.pkl")

forest_thresh_tfid.fit(tfid_X,y_train_threshold)
#save_object(forest_thresh_tfid,"forest_thresh_tfidpickle.pkl")

forest_state_tfid.fit(statetfid_X,y_train_state)
#save_object(forest_state_tfid,"forest_state_tfidpickle.pkl")

dt_thresh.fit(word2vec_X,y_train_threshold)
#save_object(dt_thresh,"dt_threshpickle.pkl")

dt_state.fit(stateword2vec_X,y_train_state)
#save_object(dt_state,"dt_statepickle.pkl")

dt_thresh_tfid.fit(tfid_X,y_train_threshold)
#save_object(dt_thresh_tfid,"dt_thresh_tfidpickle.pkl")

dt_state_tfid.fit(statetfid_X,y_train_state)
#save_object(dt_state_tfid,"dt_state_tfidpickle.pkl")

sgd_thresh.fit(word2vec_X,y_train_threshold)
#save_object(sgd_thresh,"sgd_threshpickle.pkl")

sgd_state.fit(stateword2vec_X,y_train_state)
#save_object(sgd_state,"sgd_statepickle.pkl")

sgd_thresh_tfid.fit(tfid_X,y_train_threshold)
#save_object(sgd_thresh_tfid,"sgd_thresh_tfidpickle.pkl")

sgd_state_tfid.fit(statetfid_X,y_train_state)
#save_object(sgd_state_tfid,"sgd_state_tfidpickle.pkl")



#Run test features through trained models for each of the estimators. Because the classes
#for the date threshold and the state are both non-binary, the precision, recall 
#and f1 scores are all the same

#If there is enough RAM, it is not necessary to load from pickles and delete after prediction


#with open(r"sgd_threshpickle.pkl", "rb") as input_file:
#    sgd_thresh = pickle.load(input_file)

#with open(r"sgd_statepickle.pkl", "rb") as input_file:
#    sgd_state = pickle.load(input_file)

test_predictions_thresh= sgd_thresh.predict(test_features_X)
test_predictions_state= sgd_state.predict(statetest_features_X)
    
#del sgd_thresh
#del sgd_state
    
    
#with open(r"sgd_thresh_tfidpickle.pkl", "rb") as input_file:
#    sgd_thresh_tfid = pickle.load(input_file)

#with open(r"sgd_state_tfidpickle.pkl", "rb") as input_file:
#    sgd_state_tfid = pickle.load(input_file)


sgdtfid_threshpredict=sgd_thresh_tfid.predict(test_features_tfid)
sgdtfid_statepredict=sgd_state_tfid.predict(statetest_features_tfid)

#del sgd_state_tfid
#del sgd_thresh_tfid


#with open(r"dt_threshpickle.pkl", "rb") as input_file:
#    dt_thresh = pickle.load(input_file)

#with open(r"dt_statepickle.pkl", "rb") as input_file:
#    dt_state = pickle.load(input_file)
    
treep_thresh=dt_thresh.predict(test_features_X)
treep_state=dt_state.predict(statetest_features_X)

#del dt_thresh
#del dt_state

#with open(r"dt_thresh_tfidpickle.pkl", "rb") as input_file:
#    dt_thresh_tfid = pickle.load(input_file)

#with open(r"dt_state_tfidpickle.pkl", "rb") as input_file:
#    dt_state_tfid = pickle.load(input_file)


dt_tfid_thresh_pred=dt_thresh_tfid.predict(test_features_tfid)
dt_tfid_state_pred=dt_state_tfid.predict(statetest_features_tfid)

#del dt_thresh_tfid
#del dt_state_tfid


#with open(r"forest_threshpickle.pkl", "rb") as input_file:
#    forest_thresh = pickle.load(input_file)
    
forest_thresh_predict=forest_thresh.predict(test_features_X)

#del forest_thresh

#with open(r"forest_statepickle.pkl", "rb") as input_file:
#    forest_state = pickle.load(input_file)

forest_state_predict=forest_state.predict(statetest_features_X)

#del forest_state

#with open(r"forest_thresh_tfidpickle.pkl", "rb") as input_file:
#    forest_thresh_tfid = pickle.load(input_file)

forest_thresh_tfid_predict=forest_thresh_tfid.predict(test_features_tfid)

#del forest_thresh_tfid
    
#with open(r"forest_state_tfidpickle.pkl", "rb") as input_file:
#    forest_state_tfid = pickle.load(input_file)

forest_state_tfid_predict=forest_state_tfid.predict(statetest_features_tfid)

#Results for training methods and targets compared to random likelihood


print("Likeliehood of predicting correct threshold at random: {:.4f}".format(threshold_odds))
print("Likeliehood of predicting correct state at random: {:.4f}".format(state_odds))

f1 = f1_score(y_test_threshold,treep_thresh,average="micro")
print("DecisionTree Threshold F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                            (f1-threshold_odds)))

f1 = f1_score(y_test_state,treep_state,average="micro")
print("DecisionTree State F1: {:.4f}, {:.4f} better than random".format(f1, (f1-state_odds)))

f1 = f1_score(y_test_threshold,dt_tfid_thresh_pred,average="micro")
print("DecisionTree Threshold TFID F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                                 (f1-threshold_odds)))

f1 = f1_score(y_test_state,dt_tfid_state_pred,average="micro")
print("DecisionTree State TFID F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                             (f1-state_odds)))

f1 = f1_score(y_test_threshold,forest_thresh_predict,average="micro")
print("Random Forest Threshold F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                             (f1-threshold_odds)))

f1 = f1_score(y_test_state,forest_state_predict,average="micro")
print("Random Forest State F1: {:.4f}, {:.4f} better than random".format(f1, (f1-state_odds)))

f1 = f1_score(y_test_threshold,forest_thresh_tfid_predict,average="micro")
print("Random Forest Threshold TFID F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                                  (f1-threshold_odds)))

f1 = f1_score(y_test_state,forest_state_tfid_predict,average="micro")
print("Random Forest State TFID F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                              (f1-state_odds)))

f1 = f1_score(y_test_threshold,test_predictions_thresh,average="micro")
print("SGD Threshold F1: {:.4f}, {:.4f} better than random".format(f1, (f1-threshold_odds)))

f1 = f1_score(y_test_state,test_predictions_state,average="micro")
print("SGD State F1: {:.4f}, {:.4f} better than random".format(f1, (f1-state_odds)))

f1 = f1_score(y_test_threshold,sgdtfid_threshpredict,average="micro")
print("SGD Threshold TFID F1: {:.4f}, {:.4f} better than random".format(f1, 
                                                                        (f1-threshold_odds)))

f1 = f1_score(y_test_state,sgdtfid_statepredict,average="micro")
print("SGD State TFID F1: {:.4f}, {:.4f} better than random".format(f1, (f1-state_odds)))





  model_vector=(np.mean([threshmodel[token] for token in row ['tokenized_text']],axis=0)).tolist()


Likeliehood of predicting correct threshold at random: 0.3333
Likeliehood of predicting correct state at random: 0.0667
DecisionTree Threshold F1: 0.3283, -0.0050 better than random
DecisionTree State F1: 0.0662, -0.0004 better than random
DecisionTree Threshold TFID F1: 0.3820, 0.0486 better than random
DecisionTree State TFID F1: 0.1534, 0.0867 better than random
Random Forest Threshold F1: 0.3354, 0.0021 better than random
Random Forest State F1: 0.0674, 0.0007 better than random
Random Forest Threshold TFID F1: 0.3945, 0.0612 better than random
Random Forest State TFID F1: 0.1774, 0.1108 better than random
SGD Threshold F1: 0.3347, 0.0014 better than random
SGD State F1: 0.0702, 0.0035 better than random
SGD Threshold TFID F1: 0.3708, 0.0375 better than random
SGD State TFID F1: 0.1588, 0.0921 better than random
