In [182]:
import pandas as pd
import numpy as np
import glob

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from nltk.metrics import *
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from itertools import islice

import collections
import random

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 50)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [4]:
path = './Articles'
files = glob.glob(path+'/*.csv')

li = []

for file in files:
    df = pd.read_csv(file, index_col=None, header=0)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True)
data = data.drop(data.columns[0], axis=1)

In [5]:
data.sample(10)

Unnamed: 0,id,title,publication,author,date,year,month,url,content
8908,27644,George Soros-Backed Climate March Brings Celeb...,Breitbart,Penny Starr,2017-04-28,2017.0,4.0,,Protesters at last Saturday’s March for Scienc...
6775,25153,Thai Leader Links Attacks on Tourist Sites to ...,New York Times,Richard C. Paddock,2016-08-17,2016.0,8.0,,MANILA — The head of the Thai junta urged h...
50611,152710,"A postcard from Durham, keeping it dirty in No...",Guardian,,2016-05-29,2016.0,5.0,https://www.theguardian.com/books/2016/may/29/...,"They call it “Bull City,” though, with the exc..."
90043,214838,Abdul-Jabbar: Insulting Colin Kaepernick says ...,Washington Post,Kareem Abdul-Jabbar,2016-08-30,2016.0,8.0,https://web.archive.org/web/20160831002640/htt...,During the Olympics in Rio a couple of we...
14386,33123,Gainor: Conservatives Abused by ‘Media Who Don...,Breitbart,Dan Riehl,2017-03-27,2017.0,3.0,,Vice President of Business and Culture at the ...
134344,140923,Kate Spade on the hunt for a potential buyer,New York Post,Carleton English and Lisa Fickenscher,2016-12-28,2016.0,12.0,http://nypost.com/2016/12/28/kate-spade-on-hun...,It’s a world. Kate Spade is shopping for a ...
80574,202433,California wants to regulate cow belches. It’s...,Vox,Brad Plumer,2016/9/27,2016.0,9.0,http://www.vox.com/2016/9/27/13072714/californ...,Now this is a funny headline: ”Cow farts can ...
70484,185091,Exclusive: Republican politicians jumped the g...,Reuters,Mark Hosenball and Julia Harte,2016-01-08,2016.0,1.0,http://www.reuters.com/article/us-usa-security...,Two Texas politicians made public details of ...
47549,70394,Bernie Sanders overplayed his hand,Business Insider,,2016-07-03,2016.0,7.0,,"’ ’ ’ On Monday, Hillary Clinton and Eliz..."
58474,166506,Fifth-Graders Revisit King’s ’Dream’ Speech At...,NPR,Byrd Pinkerton,2016-01-18,2016.0,1.0,http://www.npr.org/sections/ed/2016/01/18/4632...,"On Friday, the from Watkins Elementary Schoo..."


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142570 entries, 0 to 142569
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           142570 non-null  int64  
 1   title        142568 non-null  object 
 2   publication  142570 non-null  object 
 3   author       126694 non-null  object 
 4   date         139929 non-null  object 
 5   year         139929 non-null  float64
 6   month        139929 non-null  float64
 7   url          85559 non-null   object 
 8   content      142570 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 9.8+ MB


In [7]:
data['title'] = data['title'].fillna('')

In [8]:
clinton_data = data[(data['year'] == 2016) &
                    (data['title'].str.contains('Hillary Clinton'))]
clinton_data = clinton_data[(clinton_data['publication'] == 'Washington Post') | 
                            (clinton_data['publication'] == 'New York Times')]
clinton_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329 entries, 2614 to 92416
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           329 non-null    int64  
 1   title        329 non-null    object 
 2   publication  329 non-null    object 
 3   author       327 non-null    object 
 4   date         329 non-null    object 
 5   year         329 non-null    float64
 6   month        329 non-null    float64
 7   url          175 non-null    object 
 8   content      329 non-null    object 
dtypes: float64(2), int64(1), object(6)
memory usage: 25.7+ KB


In [9]:
pd.set_option('display.max_colwidth', 500)
clinton_data[['title', 'publication', 'author']].sample(5).reset_index()

Unnamed: 0,index,title,publication,author
0,3918,Did Hillary Clinton Have to Be First? - The New York Times,New York Times,Michael Barbaro
1,7551,"Your Monday Evening Briefing: Hillary Clinton, Cleveland Cavaliers, Kanye West - The New York Times",New York Times,Jonah Engel Bromwich and Sandra Stevenson
2,85365,Hillary Clinton’s campaign raised $55 million in the latest fundraising quarter,Washington Post,Anne Gearan
3,90737,53 percent of people in new WaPo-ABC poll say Hillary Clinton won the debate,Washington Post,Scott Clement
4,91274,Republicans are already plotting the next war against Hillary Clinton,Washington Post,Editorial Board


In [10]:
pd.set_option('display.max_colwidth', 50)

In [11]:
# clinton_data_1 = clinton_data[0:110]
# clinton_data_2 = clinton_data[110:220]
# clinton_data_3 = clinton_data[220:]

In [12]:
# clinton_data_1.to_csv('clinton_data_1.csv', index=False)
# clinton_data_2.to_csv('clinton_data_2.csv', index=False)
# clinton_data_3.to_csv('clinton_data_3.csv', index=False)

In [13]:
# li = []
# clinton_data_1_gender = pd.read_csv('clinton_data_1_gender.csv', index_col=None, header=0)
# clinton_data_2_gender = pd.read_csv('clinton_data_2_gender.csv', index_col=None, header=0)
# clinton_data_3_gender = pd.read_csv('clinton_data_3_gender.csv', index_col=None, header=0)
# li.append(clinton_data_1_gender)
# li.append(clinton_data_2_gender)
# li.append(clinton_data_3_gender)

# clinton_data_gender = pd.concat(li, axis=0, ignore_index=True)

In [256]:
clinton_data_gender = pd.read_csv('clinton_data_gender.csv', index_col=None, header=0)
clinton_data_gender.sample(5)

Unnamed: 0,id,title,publication,author,gender,date,year,month,url,content
93,23360,Hillary Clinton Makes Dire Predictions for Eco...,New York Times,Matt Flegenheimer and Amy Chozick,B,2016-06-22,2016.0,6.0,,"COLUMBUS, Ohio — Hillary Clinton pounded aw..."
70,22160,Donations to Foundation Vexed Hillary Clinton’...,New York Times,Steve Eder and Amy Chozick,B,2016-12-16,2016.0,12.0,,In the years before Hillary Clinton announced ...
227,212662,"47 years ago, Hillary Clintonâ€™s practice nom...",Washington Post,Alyssa Rosenberg,F,2016-06-10,2016.0,6.0,https://web.archive.org/web/20160611000057/htt...,Alyssa Rosenberg writes about politics and ...
279,214913,How Hillary Clinton helped create what she lat...,Washington Post,Karen Tumulty,F,2016-09-02,2016.0,9.0,https://web.archive.org/web/20160903001748/htt...,The epic battles between the Clintons and thei...
164,209400,"Top Democrats, and a little bit of Hollywood, ...",Washington Post,Philip Rucker,M,2016-01-22,2016.0,1.0,https://web.archive.org/web/20160123003737/htt...,DES MOINES — With another Iowa presidenti...


In [257]:
clinton_data_gender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329 entries, 0 to 328
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           329 non-null    int64  
 1   title        329 non-null    object 
 2   publication  329 non-null    object 
 3   author       328 non-null    object 
 4   gender       329 non-null    object 
 5   date         329 non-null    object 
 6   year         329 non-null    float64
 7   month        329 non-null    float64
 8   url          175 non-null    object 
 9   content      329 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 25.8+ KB


In [258]:
clinton_data_gender['gender'].value_counts()

M    185
F    101
B     43
Name: gender, dtype: int64

In [137]:
clinton_data_binary = clinton_data_gender[clinton_data_gender['gender']!='B']

### PMI

In [138]:
stopwords_list = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", 
                  "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 
                  'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 
                  'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 
                  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
                  'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 
                  'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 
                  'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 
                  'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 
                  'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'own', 'same', 
                  'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now', 'd', 
                  'll', 'm', 'o', 're', 've', 'y', 'ma']

In [139]:
all_words_sw = [word.lower()
                for review in clinton_data_gender['content']
                for word in RegexpTokenizer(r'\w+').tokenize(review)
                if not word.lower() in stopwords_list
               ]

finder = BigramCollocationFinder.from_words(all_words_sw)
bgm = BigramAssocMeasures()
score = bgm.mi_like  # metric options: pmi or mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

list(islice(collocations.items(), 30)) # return word pairs with highest scores

[('mrs_clinton', 496.64757026191324),
 ('mr_trump', 232.99157587935437),
 ('hillary_clinton', 96.88607581590928),
 ('new_york', 81.86635997699827),
 ('white_house', 76.48245366184449),
 ('united_states', 58.13021958489431),
 ('secretary_state', 55.483533795542584),
 ('f_b', 38.25947031368718),
 ('washington_post', 35.24855172413793),
 ('briefing_posted', 34.53267136450993),
 ('nytimes_com', 32.6530612244898),
 ('wall_street', 26.853715728715727),
 ('state_department', 25.667643707072678),
 ('donald_trump', 25.469081929029453),
 ('bernie_sanders', 24.55971479500891),
 ('donald_j', 24.113299480466114),
 ('los_angeles', 23.24867724867725),
 ('running_mate', 19.167626728110598),
 ('judicial_watch', 19.140625),
 ('super_pac', 18.14589104339797),
 ('mr_sanders', 15.735150176745794),
 ('mitt_romney', 15.68),
 ('ted_cruz', 14.709898242368178),
 ('private_email', 14.558537646888148),
 ('supreme_court', 14.390056818181819),
 ('classified_information', 14.361702127659575),
 ('minimum_wage', 13.88

In [140]:
all_words_sw_m = [word.lower()
                for review in clinton_data_gender[clinton_data_gender['gender']=='M']['content']
                for word in RegexpTokenizer(r'\w+').tokenize(review)
                if not word.lower() in stopwords_list
               ]

finder = BigramCollocationFinder.from_words(all_words_sw_m)
bgm = BigramAssocMeasures()
score = bgm.mi_like  # metric options: pmi or mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

list(islice(collocations.items(), 30)) # return word pairs with highest scores

[('mrs_clinton', 193.24399788447388),
 ('mr_trump', 71.89349112426035),
 ('hillary_clinton', 50.4338525390625),
 ('united_states', 33.920771052846526),
 ('new_york', 33.84222253494699),
 ('f_b', 32.82005674378001),
 ('secretary_state', 32.60491537213077),
 ('white_house', 23.06057672917927),
 ('washington_post', 19.639226914817467),
 ('state_department', 19.315151215166065),
 ('classified_information', 16.915597433841217),
 ('judicial_watch', 16.568181818181817),
 ('donald_trump', 16.129712903225805),
 ('bernie_sanders', 14.282162026821903),
 ('donald_j', 13.638330170777989),
 ('private_email', 12.531621392190154),
 ('george_w', 11.663797226207556),
 ('w_bush', 10.771170183611948),
 ('email_server', 10.702451612903225),
 ('wall_street', 10.176923076923076),
 ('ted_cruz', 10.139166666666666),
 ('des_moines', 10.0),
 ('new_hampshire', 8.694830659536542),
 ('running_mate', 8.556962025316455),
 ('los_angeles', 8.333333333333334),
 ('park_ridge', 8.333333333333334),
 ('ronald_reagan', 7.846

In [141]:
all_words_sw_f = [word.lower()
                for review in clinton_data_gender[clinton_data_gender['gender']=='F']['content']
                for word in RegexpTokenizer(r'\w+').tokenize(review)
                if not word.lower() in stopwords_list
               ]

finder = BigramCollocationFinder.from_words(all_words_sw_f)
bgm = BigramAssocMeasures()
score = bgm.mi_like  # metric options: pmi or mi_like
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

list(islice(collocations.items(), 30)) # return word pairs with highest scores

[('mrs_clinton', 86.77714312541079),
 ('mr_trump', 63.27936746987952),
 ('hillary_clinton', 46.59589689439296),
 ('white_house', 38.99701549403099),
 ('briefing_posted', 29.242389175716575),
 ('new_york', 26.11917098445596),
 ('united_states', 22.32428842504744),
 ('washington_post', 21.49547803617571),
 ('nytimes_com', 21.16),
 ('secretary_state', 19.899764150943398),
 ('miss_morning', 15.20875),
 ('first_lady', 12.320175438596491),
 ('wall_street', 12.05231037489102),
 ('bernie_sanders', 11.708889690845663),
 ('posted_6', 11.304994192799072),
 ('donald_trump', 10.456110154905335),
 ('p_eastern', 10.436029097133076),
 ('los_angeles', 9.090909090909092),
 ('planned_parenthood', 8.923747276688454),
 ('ms_warren', 8.097165991902834),
 ('evening_latest', 7.695762175838077),
 ('posted_weekdays', 6.451219512195122),
 ('bill_clinton', 6.410081148564295),
 ('window_interactiveomniture', 6.0),
 ('sexual_assault', 5.951020408163266),
 ('super_pac', 5.76),
 ('latest_1', 5.63088313061872),
 ('la_

### Pos tagging

In [102]:
import string
def clean_text(unprocessed_string):
    stop_words = stopwords.words()
    cleaned_text = ""
    unprocessed_string = np.str.lower(unprocessed_string)
    unprocessed_string = np.str.replace(unprocessed_string, "'", "")

    text_tokens = word_tokenize(unprocessed_string)
    for word in text_tokens:
        if word not in string.punctuation:
            if word not in stop_words:
                if len(word) > 1:
                    cleaned_text = cleaned_text + " " + word
    cleaned_text = ("").join(cleaned_text)
    return cleaned_text

In [237]:
article_contents = clinton_data_binary['content'].reset_index(drop=True).copy()

for i in range(len(article_contents)):
    processed_article = clean_text(article_contents[i])
    processed_article = pos_tag(word_tokenize(processed_article))
    for j in range(len(processed_article)):
        processed_article[j] = processed_article[j][0] + '-' + processed_article[j][1]
    article_contents[i] = TreebankWordDetokenizer().detokenize(processed_article)

In [238]:
print(article_contents[0][:1000])

hillary-JJ clinton-NN advisers-NNS allies-NNS begun-VBN extensive-JJ discussions-NNS running-VBG mate-NN seeking-VBG compile-JJ list-NN 15-CD 20-CD potential-JJ picks-NNS team-VBP start-RB vetting-VBG late-JJ spring-NN mrs.-NN clinton-NN team-NN grapple-NN complicated-VBD questions-NNS like-IN whether-IN united-JJ states-NNS ready-JJ ticket-NN whether-IN choice-NN vice-NN president-NN would-MD able-JJ handle-VB working-VBG white-JJ house-NN former-JJ president-NN bill-NN clinton-NN wielded-VBD significant-JJ influence-NN policy-NN nomination-NN fight-NN still-RB fluid-VBD mrs.-JJ clinton-NN confident-JJ enough-RB victory-NN described-JJ vision-NN running-VBG mate-JJ objectives-NNS search-VBP according-VBG campaign-NN advisers-NNS dozen-NN democrats-NNS close-VBP campaign-NN clintons-NNS mind-NN said-VBD intrigued-JJ several-JJ contenders-NNS scenarios-NNS among-IN names-NNS discussion-VBP mrs.-FW clinton-NN mr.-VBZ clinton-NNP campaign-NN advisers-NNS senators-NNS tim-VBP kaine-JJ mark

### Run logistic regression and naive bayes with TF-IDF

In [47]:
def print_scores(scores):
    k = len(scores['test_precision_macro'])
    print('precision_macro:    ' + str(sum(scores['test_precision_macro']) / k))
    print('recall_macro:       ' + str(sum(scores['test_recall_macro']) / k))
    print('f1_macro:           ' + str(sum(scores['test_f1_macro']) / k))
    print('precision_weighted: ' + str(sum(scores['test_precision_weighted']) / k))
    print('recall_weighted:    ' + str(sum(scores['test_recall_weighted']) / k))
    print('f1_weighted:        ' + str(sum(scores['test_f1_weighted']) / k))
    
scoring = ['precision_macro', 'recall_macro', 'f1_macro',
           'precision_weighted', 'recall_weighted', 'f1_weighted']

In [259]:
X = clinton_data_binary['content']
y = clinton_data_binary['gender']

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=10000)
vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

lr_model = LogisticRegression(C=10000)

scores = cross_validate(lr_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.7541614975590833
recall_macro:       0.7060553410553411
f1_macro:           0.703721845777503
precision_weighted: 0.7610779554213554
recall_weighted:    0.772474289171204
f1_weighted:        0.745451430879626


In [260]:
nb_model = MultinomialNB(alpha=1)

scores = cross_validate(nb_model, X_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.425720551378446
recall_macro:       0.5095238095238095
f1_macro:           0.41185481384222095
precision_weighted: 0.49380015753236695
recall_weighted:    0.6537810042347247
f1_weighted:        0.5229231679720593


In [261]:
lr_model = LogisticRegression(C=10000)
lr_model.fit(X_tfidf, y)

feature_names = vectorizer.get_feature_names() 
coefs_with_fns = sorted(zip(lr_model.coef_[0], feature_names)) 
coef_word=pd.DataFrame(coefs_with_fns)
coef_word.columns='coefficient','word'
most_pos = coef_word.sort_values(by='coefficient', ascending=True).head(10).reset_index(drop=True)
most_neg = coef_word.sort_values(by='coefficient', ascending=False).head(10).reset_index(drop=True)
pd.concat([most_pos, most_neg], axis=1)

Unnamed: 0,coefficient,word,coefficient.1,word.1
0,-15.451358,women,11.380365,that
1,-14.912539,_____,10.290254,debate
2,-9.625111,foundation,9.727668,is
3,-9.137227,bill clinton,7.52966,gay
4,-7.866496,clinton said,5.6747,on the
5,-7.501246,mr sanders,5.647106,giuliani
6,-7.387173,bill,5.629507,cyrus
7,-7.224142,posted,5.487058,mr
8,-7.001687,her,5.389282,democratic
9,-6.700179,______,5.351508,the debate


### Logistic regression with pos tagging

In [263]:
X_pos_tag = article_contents
y = clinton_data_binary['gender']

vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=20000, token_pattern=r"(?u)\b\w[\w-]*\w\b")
vectorizer.fit(X_pos_tag)
X_pos_tag_tfidf = vectorizer.transform(X_pos_tag)

lr_model = LogisticRegression(C=10000)

scores = cross_validate(lr_model, X_pos_tag_tfidf, y, scoring=scoring, cv=5)
print_scores(scores)

precision_macro:    0.7824773099146085
recall_macro:       0.6773423423423425
f1_macro:           0.6665972325192714
precision_weighted: 0.7742459342401525
recall_weighted:    0.7585601935874168
f1_weighted:        0.7176235569638868


In [264]:
lr_model = LogisticRegression(C=10000)
lr_model.fit(X_pos_tag_tfidf, y)

feature_names = vectorizer.get_feature_names() 
coefs_with_fns = sorted(zip(lr_model.coef_[0], feature_names)) 
coef_word=pd.DataFrame(coefs_with_fns)
coef_word.columns='coefficient','word'
most_pos = coef_word.sort_values(by='coefficient', ascending=True).head(10).reset_index(drop=True)
most_neg = coef_word.sort_values(by='coefficient', ascending=False).head(10).reset_index(drop=True)
pd.concat([most_pos, most_neg], axis=1)

Unnamed: 0,coefficient,word,coefficient.1,word.1
0,-14.830619,women-nns,9.330104,debate-nn
1,-9.239258,foundation-nn,5.050204,democratic-jj
2,-7.900778,clinton-nn said-vbd,5.041502,gay-nn
3,-7.635383,bill-nn,4.636914,mr
4,-6.9942,posted-vbd,4.510755,department-nn
5,-6.931546,clintons-nns,4.283355,sanders-nns
6,-6.78014,cd,4.219162,americans-nns
7,-6.518092,millennials-nns,4.173443,vote-nn
8,-5.727692,markings-nns,4.028605,results-nns
9,-5.605606,bill-nn clinton-nn,4.01586,presidential-jj debate-nn


In [175]:
# # cross validation
# n_splits = 10
# kf = KFold(n_splits=n_splits)
# cv_accuracy = []
# cv_precision = []
# cv_recall = []
# for train, test in kf.split(featuresets):
#     train_data = np.array(featuresets)[train]
#     test_data = np.array(featuresets)[test]
    
#     prediction = []
#     classifier = nltk.NaiveBayesClassifier.train(train_data)
    
#     for i in range(len(test_data)):
#         prediction.append(classifier.classify(test_data[i][0]))
    
#     cv_accuracy.append(accuracy([i[1] for i in test_data], prediction))
#     cv_precision.append(precision(set([i[1] for i in test_data]), set(prediction)))
#     cv_recall.append(recall(set([i[1] for i in test_data]), set(prediction)))
#     # note: in nltk package, accuracy takes lists as inputs, and precision recall take sets as inputs

# avg_accuracy = sum(cv_accuracy)/n_splits
# avg_precision = sum(cv_precision)/n_splits
# avg_recall = sum(cv_recall)/n_splits

# print(str(n_splits) + '-fold cross validation')
# print('Accuracy')
# for i in cv_accuracy:
#     print(i)
    
# print('Precision')
# for i in cv_precision:
#     print(i)
    
# print('Recall')
# for i in cv_recall:
#     print(i)
    
# print('Average accuracy: ' + str(avg_accuracy))
# print('Average precision: ' + str(avg_precision))
# print('Average recall: ' + str(avg_recall))