# K Nearest Neighbors
University of Denver

Makarand Nadendla

In [2]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np 
import pandas as pd 
import re, string

In [45]:
df_income_evaluation_cat = pd.read_csv("income_evaluation_cat.csv")
df_income_evaluation_cat.head()

Unnamed: 0,workclass,education,race,gender,income
0,State-gov,Bachelors,White,Male,<=50K
1,Self-emp-not-inc,Bachelors,White,Male,<=50K
2,Private,HS-grad,White,Male,<=50K
3,Private,11th,Black,Male,<=50K
4,Private,Bachelors,Black,Female,<=50K


In [46]:
for var in df_income_evaluation_cat.columns.values:
    print(df_income_evaluation_cat[var].unique())
    print()

[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']

[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']

[' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']

[' Male' ' Female']

[' <=50K' ' >50K']



In [47]:
X = [" Private"," Bachelors"," White", " Female"]

So the Bayes Theorom would be:

P(Yi/x) = P(X/Yi)*P(Yi)
then find the max of the P(Yi/X) and select your answer.

In [48]:
def n_bayes_cat(training, test, outcome_col):
    
    post_probability = {}
    
    for outcome in training[outcome_col].unique():
        p_yi = training[outcome_col].value_counts()[outcome]/training.shape[0]
        p_x_yi = 1
        
        # p_x_yi calculations
        training_yi = training[training[outcome_col] == outcome]
        for i,x in enumerate(test):
            partial_prob = sum(training_yi.iloc[:,i] == x)/training_yi.shape[0]
            p_x_yi = p_x_yi*partial_prob
      
        p_yi_x = p_x_yi*p_yi
        post_probability[outcome]=p_yi_x
    
    pred = max(post_probability, key=post_probability.get)
    return pred


In [49]:
n_bayes_cat(df_income_evaluation_cat, X, ' income')

' <=50K'

In [50]:
df_income_evaluation_cat = pd.get_dummies(df_income_evaluation_cat, columns=[" workclass"," education"," race","gender"])

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df_income_evaluation_cat.drop(" income", axis = 1),df_income_evaluation_cat[" income"])

In [52]:
NB = CategoricalNB()
NB.fit(X_train, y_train)
y_test_pred = NB.score(X_test,y_test)
y_train_pred = NB.score(X_train, y_train)

In [53]:
print(y_test_pred, y_train_pred)

0.7791426114727921 0.7875921375921376


In [54]:
df_income_evaluation_cont = pd.read_csv("income_evaluation_continuous.csv")
df_income_evaluation_cont.apply(func = ["mean","std"])

Unnamed: 0,age,education_num,hours_per_week
mean,38.581647,10.080679,40.437456
std,13.640433,2.57272,12.347429


In [55]:
X = [30, 10, 45]

In [56]:
def n_bayes_cont(training, test, outcome_col):
    
    post_probability = {}
    
    for outcome in training[outcome_col].unique():
        p_yi = training[outcome_col].value_counts()[outcome]/training.shape[0]
        p_x_yi = 1
        
        # p_x_yi calculations
        training_yi = training[training[outcome_col] == outcome]
        for i,x in enumerate(test):
            mean = np.mean(training_yi.iloc[:,i])
            std = np.std(training_yi.iloc[:,i])
            partial_prob = norm.pdf(x, mean, std)
            p_x_yi = p_x_yi*partial_prob
      
        p_yi_x = p_x_yi*p_yi
        post_probability[outcome]=p_yi_x

    pred = max(post_probability, key=post_probability.get)
    return pred


In [57]:
n_bayes_cont(df_income_evaluation_cont, X, " income")

' <=50K'

In [58]:
X_train, X_test, y_train, y_test = train_test_split(df_income_evaluation_cont.drop(" income", axis = 1),df_income_evaluation_cont[" income"])
NB_model = Pipeline(steps = [("Scaler", StandardScaler()),("Model", GaussianNB())])
NB_model.fit(X_train, y_train)
print(NB_model.score(X_train, y_train), NB_model.score(X_test, y_test))


0.8011056511056511 0.7957253408672154


In [39]:
df_true = pd.read_csv("True.csv")
df_true["news_type"] = pd.Series(df_true.shape[0]*"True".split())

In [40]:
df_fake = pd.read_csv("Fake.csv")
df_fake["news_type"] = pd.Series(df_fake.shape[0]*"Fake".split())

In [45]:
df_total = pd.concat([df_true,df_fake])
df_total["news"] =  df_total["title"] + " " + df_total["text"]
df_total[df_total["news"].apply(lambda x: len(x) < 50)] 
df_total.drop(labels=["title", "text"],axis=1 ,inplace = True)

In [46]:
df_total = df_total.reindex(["date","subject","news","news_type"], axis = 1)
df_total

Unnamed: 0,date,subject,news,news_type
0,31-Dec-17,politicsNews,"As U.S. budget fight looms, Republicans flip t...",True
1,29-Dec-17,politicsNews,U.S. military to accept transgender recruits o...,True
2,31-Dec-17,politicsNews,Senior U.S. Republican senator: 'Let Mr. Muell...,True
3,30-Dec-17,politicsNews,FBI Russia probe helped by Australian diplomat...,True
4,29-Dec-17,politicsNews,Trump wants Postal Service to charge 'much mor...,True
...,...,...,...,...
23476,"January 16, 2016",Middle-east,McPain: John McCain Furious That Iran Treated ...,Fake
23477,"January 16, 2016",Middle-east,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,Fake
23478,"January 15, 2016",Middle-east,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Fake
23479,"January 14, 2016",Middle-east,How to Blow $700 Million: Al Jazeera America F...,Fake


In [47]:
class contraction_replacer(object):
    def __init__(self, contraction_patterns):
        self._contraction_regexes = [(re.compile(p), replaced_text) for p, replaced_text in contraction_patterns]

    def do_contraction_normalization(self, text):
        for contraction_regex, replaced_text in self._contraction_regexes:
            text = contraction_regex.sub(replaced_text, text)
        return text

def preprocess_text(text):
    
    edited_contraction_patterns = [(r'won\'t', 'will not'), (r'can\'t', 'cannot'),
    (r'haven\'t', 'have not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)\'re', '\g<1> are'), (",",""), ("'","")]

    replacer = contraction_replacer(edited_contraction_patterns)
    text = replacer.do_contraction_normalization(text)
    text = word_tokenize(text)

    stop_words = set(stopwords.words('english')) 
    
    # lemmatization
    wnlemma = WordNetLemmatizer()
    lemma_text = []
    for word in text:
        if word in string.punctuation:
            continue
        if word in stop_words:
            continue
        lemma_text.append(wnlemma.lemmatize(word))
    lemma_text = " ".join(lemma_text)

    return lemma_text

df_total["news"] = df_total["news"].apply(preprocess_text)

In [48]:
vectorizer = TfidfVectorizer()
df_total_subset = df_total.sample(5000)
X = vectorizer.fit_transform(df_total_subset["news"]).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, df_total_subset["news_type"], test_size=0.3)

In [49]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
print(NB_model.score(X_train, y_train), NB_model.score(X_test, y_test))

0.9614285714285714 0.9333333333333333


In [50]:
cv_results = cross_validate(NB_model, X, df_total_subset["news_type"], cv=10)
print(np.mean(cv_results['test_score']), np.std(cv_results['test_score']))

0.9414 0.0072691127381544806


In [52]:
search = GridSearchCV(NB_model, param_grid={"alpha":[k for k in np.linspace(0,1.0,num=11)], "fit_prior":[True,False]})
search.fit(X_train, y_train)
search.best_params_

{'alpha': 0.4, 'fit_prior': False}

In [14]:
df_football = pd.read_csv("football_feed_df_final.csv")

In [17]:
df_football= df_football[["full_text","team"]]

In [27]:
df_football.dropna(inplace=True)

In [37]:
vectorizer = TfidfVectorizer()
#df_football_subset = df_football.sample(5000)
df_football["full_text"].apply(preprocess_text)
X = vectorizer.fit_transform(df_football["full_text"]).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, df_football["team"], test_size=0.3)

In [30]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
print(NB_model.score(X_train, y_train), NB_model.score(X_test, y_test))

0.9494917407878017 0.7422222222222222


In [38]:
search = GridSearchCV(NB_model, param_grid={"alpha":[k for k in np.linspace(0,1.0,num=11)], "fit_prior":[True,False]})
search.fit(X_train, y_train)
search.best_params_
print(search.score(X_train, y_train), search.score(X_test, y_test))

0.968869123252859 0.7762962962962963
