In [11]:
# Loading csvs
import pandas as pd
import numpy as np
from urllib.parse import urlparse

false = pd.read_csv('politifact_fake.csv', usecols=[1], skiprows=[0], na_values=['nan'], header=None, skip_blank_lines=True)
real = pd.read_csv('politifact_real.csv', usecols=[1], skiprows=[0], na_values=['nan'], header=None, skip_blank_lines=True)

# For some reason, the skip_blank_lines parameter wasn't working, so this is needed:
false.dropna(inplace=True)
real.dropna(inplace=True)


print(false.head())
print(real.head())


                                                   1
0          speedtalk.com/forum/viewtopic.php?t=51650
1  politics2020.info/index.php/2018/03/13/court-o...
2  www.nscdscamps.org/blog/category/parenting/467...
3  https://howafrica.com/oscar-pistorius-attempts...
4  http://washingtonsources.org/trump-votes-for-d...
                                                   1
0                          http://www.nfib-sbet.org/
1  http://www.cq.com/doc/newsmakertranscripts-494...
2  https://web.archive.org/web/20080204072132/htt...
3  https://web.archive.org/web/20110811143753/htt...
4  https://web.archive.org/web/20070820164107/htt...


In [12]:
def url_preprocess(data, label):
    full_set = []
    for i, row in data.iterrows():
        trimmed_url = urlparse(data.at[i, 1])

        #special case where the actual url is after an archive link
        if trimmed_url.netloc == 'web.archive.org':
            if 'http://' in trimmed_url.path:  
                trimmed_url = trimmed_url.path.split('http://')[1]
                trimmed_url = trimmed_url.split('/')[0]
                cur = [trimmed_url, label]
                full_set.append(cur)

            elif 'https://' in trimmed_url.path:
                trimmed_url = trimmed_url.path.split('https://')[1]
                trimmed_url = trimmed_url.split('/')[0]
                cur = [trimmed_url, label]
                full_set.append(cur)

        #normal case, where we print the netloc if it exists, path otherwise
        elif not trimmed_url.netloc:  
            trimmed_url = trimmed_url.path.split('/')[0]
            cur = [trimmed_url, label]
            full_set.append(cur)
        else:
            trimmed_url = trimmed_url.netloc
            cur = [trimmed_url, label]
            full_set.append(cur)
            
    return full_set


In [13]:
# Getting data into a single dataframe        
false_list = url_preprocess(false, 'false')
real_list = url_preprocess(real, 'real')

df = pd.DataFrame(false_list, columns = ['url', 'label'])
df2 = pd.DataFrame(real_list, columns = ['url', 'label'])
df = df.append(df2, ignore_index=True)

print(df)

                             url  label
0                  speedtalk.com  false
1              politics2020.info  false
2             www.nscdscamps.org  false
3                  howafrica.com  false
4          washingtonsources.org  false
5                      gloria.tv  false
6          blogs.trendolizer.com  false
7        www.religionmind.com:80  false
8        freedomcrossroads.us:80  false
9              beforeitsnews.com  false
10              yournewswire.com  false
11          politicspaper.com:80  false
12                dailyfeed.news  false
13                    jis.gov.jm  false
14                         me.me  false
15              therightists.com  false
16              yournewswire.com  false
17   rickeysmileymorningshow.com  false
18             www.breitbart.com  false
19              www.politico.com  false
20              www.mirror.co.uk  false
21                nrtonline.info  false
22         www.independent.co.uk  false
23     usaconservativereport.com  false


In [17]:
#get sample proportion
label_prob = df.groupby('label').size().div(len(df))
print(label_prob)

#new_prob = df.groupby(['url', 'label']).size().div(len(df)).div(label_prob, axis=0, level='label')
new_prob = df.groupby('url')['label'].value_counts() / df.groupby('url')['label'].count()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(new_prob)

label
false    0.430584
real     0.569416
dtype: float64
url                                      label
100percentfedup.com                      false    1.000000
24trueinfoamerica.com                    false    1.000000
abcnews.go.com                           real     1.000000
abcnews.go.com:80                        real     1.000000
action.citizen.org                       real     1.000000
action.getourtroopsout.com:80            real     1.000000
actionnetwork.org                        false    1.000000
actionnews3.com                          false    1.000000
activistmommy.com                        false    1.000000
afp.google.com:80                        real     1.000000
alexander.senate.gov                     real     1.000000
allnews4us.com                           false    1.000000
alternativemediasyndicate.com            false    1.000000
america.aljazeera.com                    real     1.000000
americaneedsmitt.com                     real     1.000000
americanfla

In [46]:
#Logistic regression
from sklearn.model_selection import train_test_split

#Train/Test/Validation splitting will be 64% Train, 20% Test, and 16% Validation
#first need to split into test and train
X = df['url']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#then split into validation and train 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

print(len(X_train))
print(len(X_test))
print(len(X_val))

797                         www.nytimes.com
574                          www.forbes.com
720                         www.youtube.com
417                      www.trainnews.info
340                          daily-vine.com
365                        ghanaweb.website
566                         edition.cnn.com
270                          empirenews.net
343                    conservativepost.com
907                    www.fedupthebook.com
375                             wazanews.tk
21                           nrtonline.info
58                           sportfella.com
878                        www.doe.mass.edu
246                             www.bbc.com
921                                  cq.com
593                  www.washingtonpost.com
67                 www.thepatriotreport.net
599                          thomas.loc.gov
496                   www.whitehouse.gov:80
478                           www.unhcr.org
71                         www.react365.com
295                     dailywor

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn import preprocessing
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

logReg = LogisticRegression()
le = preprocessing.LabelEncoder()
X_train = le.fit_transform(X_train.tolist())
X_test = le.fit_transform(X_test.tolist())

X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
print(X_train)
#need to transform strings

logReg.fit(X_train, y_train)

prediction = logReg.predict(X_test)

print(classification_report(y_test, prediction))

Counter({'real': 360, 'false': 276})
Counter({'real': 120, 'false': 79})
[[318]
 [269]
 [391]
 [371]
 [ 38]
 [ 72]
 [ 54]
 [ 57]
 [ 35]
 [266]
 [204]
 [108]
 [157]
 [257]
 [224]
 [ 37]
 [385]
 [368]
 [172]
 [388]
 [375]
 [340]
 [ 46]
 [112]
 [170]
 [238]
 [387]
 [307]
 [396]
 [235]
 [309]
 [346]
 [324]
 [355]
 [248]
 [391]
 [318]
 [185]
 [228]
 [225]
 [ 92]
 [288]
 [ 87]
 [318]
 [331]
 [365]
 [ 34]
 [239]
 [260]
 [387]
 [366]
 [391]
 [318]
 [150]
 [221]
 [389]
 [391]
 [248]
 [176]
 [305]
 [314]
 [107]
 [391]
 [387]
 [ 95]
 [318]
 [ 66]
 [  0]
 [391]
 [198]
 [238]
 [332]
 [ 77]
 [305]
 [ 27]
 [162]
 [  0]
 [111]
 [199]
 [ 78]
 [361]
 [182]
 [178]
 [285]
 [227]
 [395]
 [151]
 [279]
 [201]
 [ 32]
 [158]
 [317]
 [311]
 [387]
 [  0]
 [220]
 [ 71]
 [ 84]
 [ 47]
 [ 68]
 [173]
 [ 96]
 [128]
 [349]
 [  0]
 [301]
 [ 58]
 [101]
 [181]
 [161]
 [324]
 [391]
 [385]
 [127]
 [116]
 [318]
 [222]
 [253]
 [193]
 [  0]
 [271]
 [ 90]
 [272]
 [379]
 [141]
 [353]
 [ 94]
 [362]
 [135]
 [275]
 [348]
 [ 95]
 [2

  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
