In [62]:
import numpy as np
import twitter, re, datetime, pandas as pd
import os
from dotenv import load_dotenv

In [63]:
from pathlib import Path
env_path = Path("../conf") / "app.env"
load_dotenv(dotenv_path=env_path)

True

In [64]:
twitter_keys = {
    'consumer_key':        os.getenv("CONSUMER_KEY"),
    'consumer_secret':     os.getenv("CONSUMER_SECRET"),
    'access_token_key':    os.getenv("ACCESS_TOKEN_KEY"),
    'access_token_secret': os.getenv("ACCESS_TOKEN_SECRET"),
}

api = twitter.Api(
    consumer_key         =   twitter_keys['consumer_key'],
    consumer_secret      =   twitter_keys['consumer_secret'],
    access_token_key     =   twitter_keys['access_token_key'],
    access_token_secret  =   twitter_keys['access_token_secret'],
    tweet_mode = 'extended'
)

In [65]:
print(api.VerifyCredentials().name)

Lukas Riesch


In [66]:
#TweetMiner function from Mike Roman

class TweetMiner(object):

    
    def __init__(self, api, result_limit = 20):
        
        self.api = api        
        self.result_limit = result_limit
        

    def mine_user_tweets(self, user="HillaryClinton", mine_retweets=False, max_pages=20):

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= max_pages:
            
            if last_tweet_id:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, max_id=last_tweet_id - 1, include_rts=mine_retweets)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, include_rts=mine_retweets)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   item['retweet_count'],
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                    }
                
                except:
                        mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   0,
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                    }
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data

In [67]:
miner = TweetMiner(api=api, result_limit=200)
data_trump = miner.mine_user_tweets(user="realDonaldTrump")
data_clinton = miner.mine_user_tweets(user="HillaryClinton")

In [68]:
df_trump = pd.DataFrame(data_trump)
df_clinton = pd.DataFrame(data_clinton)

In [69]:
print(type(df_clinton))
print(type(df_trump))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [70]:
df = pd.concat([df_trump, df_clinton], axis=0)

In [76]:
df.shape

(3934, 6)

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [80]:
vect = TfidfVectorizer(ngram_range=(2,5), stop_words="english")
summaries = "".join(df_trump["text"])
ngrams_summaries = vect.build_analyzer()(summaries)

Counter(ngrams_summaries).most_common(20)

[('impeachment hoax', 56),
 ('fake news', 50),
 ('mini mike', 48),
 ('united states', 39),
 ('kag2020 https', 36),
 ('republican party', 30),
 ('radical left', 28),
 ('total endorsement', 27),
 ('great state', 27),
 ('white house', 24),
 ('america great', 24),
 ('witch hunt', 24),
 ('great job', 23),
 ('mike bloomberg', 23),
 ('left democrats', 23),
 ('complete total', 22),
 ('complete total endorsement', 22),
 ('president trump', 21),
 ('radical left democrats', 21),
 ('mini mike bloomberg', 20)]

In [81]:
vect = TfidfVectorizer(ngram_range=(2,5), stop_words="english")
summaries = "".join(df_clinton["text"])
ngrams_summaries = vect.build_analyzer()(summaries)
Counter(ngrams_summaries).most_common(20)

[('donald trump', 98),
 ('https ttgeqxnqym', 75),
 ('vote https', 62),
 ('make sure', 61),
 ('hillary https', 60),
 ('health care', 50),
 ('https 3tkj4h68kz', 44),
 ('ttgeqxnqym https', 42),
 ('https ttgeqxnqym https', 42),
 ('human rights', 38),
 ('debatenight https', 38),
 ('3tkj4h68kz https', 32),
 ('https 3tkj4h68kz https', 32),
 ('climate change', 30),
 ('hillary clinton', 30),
 ('potus https', 29),
 ('today https', 28),
 ('trump https', 26),
 ('young people', 25),
 ('202 224', 25)]

In [87]:
from textacy.preprocess import preprocess_text

tweet_text = df["text"].values
clean_text = [preprocess_text(x, fix_unicode=True, lowercase=True, no_urls=True, no_emails=True, no_phone_numbers=True, no_currency_symbols=True, no_punct=True, no_accents=True) for x in tweet_text]

NotImplementedError: As of v0.7.0, :func:`fix_bad_unicode()` is no longer implemented in textacy. Instead, install and import ``ftfy`` directly, and call ``ftfy.fix_text(text)`` ,which is more extensive and customizable than textacy's wrapper of it.For details, check out https://ftfy.readthedocs.io.

In [103]:
import ftfy
tweet_text = df["text"].values
clean_text = [ftfy.fix_text(x) for x in tweet_text]
clean_text = [re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE) for text in clean_text]

In [104]:
print(tweet_text[1:8])

['HYDROXYCHLOROQUINE &amp; AZITHROMYCIN, taken together, have a real chance to be one of the biggest game changers in the history of medicine. The FDA has moved mountains - Thank You! Hopefully they will BOTH (H works better with A, International Journal of Antimicrobial Agents).....'
 'Great story. Thank you to Mr. Young of Jonesboro, Arkansas! https://t.co/i9xH8VXfS2'
 'https://t.co/MLLfFTqv19' 'https://t.co/2wAUfZwBsa'
 'Today I spoke with our Nation’s Small Businesses, which employ nearly half of America’s workforce. We are taking the MOST aggressive action in history to deliver fast relief to your businesses and workers. We will always protect our Small Businesses! @SBAgov https://t.co/lf3y5iJ4hd'
 'https://t.co/ytgBP3hJv1' 'A Great Guy! https://t.co/L3sjMW9Inw']


In [105]:
print(clean_text[1:8])

['HYDROXYCHLOROQUINE & AZITHROMYCIN, taken together, have a real chance to be one of the biggest game changers in the history of medicine. The FDA has moved mountains - Thank You! Hopefully they will BOTH (H works better with A, International Journal of Antimicrobial Agents).....', 'Great story. Thank you to Mr. Young of Jonesboro, Arkansas! https://t.co/i9xH8VXfS2', '', '', "Today I spoke with our Nation's Small Businesses, which employ nearly half of America's workforce. We are taking the MOST aggressive action in history to deliver fast relief to your businesses and workers. We will always protect our Small Businesses! @SBAgov https://t.co/lf3y5iJ4hd", '', 'A Great Guy! https://t.co/L3sjMW9Inw']


In [109]:
y = df["handle"].map(lambda x: 1 if x=="realDonaldTrump" else 0).values
print(max(pd.Series(y).value_counts(normalize=True)))

0.6484494153533299


In [110]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

tfv = TfidfVectorizer(ngram_range=(2,4), max_features=2000)
X = tfv.fit_transform(clean_text).todense()
print(X.shape)

(3934, 2000)


In [112]:
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression()
params = {"penalty":["l1", "l2"], "C":np.logspace(-5,0,100)}
gs = GridSearchCV(lr, param_grid=params, cv=10, verbose=1)
gs.fit(X,y)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2000 out of 2000 | elapsed: 10.9min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.00000000e-05, 1....
       1.09749877e-01, 1.23284674e-01, 1.38488637e-01, 1.55567614e-01,
       1.74752840e-01, 1.96304065e-01, 2.20513074e-01, 2.47707636e-01,
       2.78255940e-01, 3.12571585e-01, 3.51119173e-01, 3.94420606e-01,
       4.43062146e-01, 4.97702356e-01, 5.59081018e-01, 6.28029144e-01,
       7

In [113]:
print(gs.best_params_)
print(gs.best_score_)

{'C': 0.8902150854450392, 'penalty': 'l2'}
0.83477351106289


In [114]:
from sklearn.model_selection import cross_val_score
acc = cross_val_score(LogisticRegression(), X, y, cv=10)

In [115]:
print(acc.mean())
print(1-y.mean())

0.83477351106289
0.64844941535333


In [116]:
estimator = LogisticRegression(penalty="l2", C=gs.best_params_["C"])
estimator.fit(X,y)

source_test = [
    "The presidency doesn’t change who you are—it reveals who you are. And we’ve seen all we need to of Donald Trump.",
    "Crooked Hillary is spending tremendous amounts of Wall Street money on false ads against me. She is a very dishonest person!"
]

Xtest = tfv.transform(source_test)
pd.DataFrame(estimator.predict_proba(Xtest), columns=["Proba_Hilary", "Proba_Trump"])

Unnamed: 0,Proba_Hilary,Proba_Trump
0,0.908105,0.091895
1,0.434411,0.565589


In [117]:
estimator.predict_proba(X)

array([[0.41386652, 0.58613348],
       [0.3552458 , 0.6447542 ],
       [0.26215435, 0.73784565],
       ...,
       [0.94597099, 0.05402901],
       [0.90364661, 0.09635339],
       [0.93593955, 0.06406045]])

In [123]:
probas_x = pd.DataFrame(estimator.predict_proba(X), columns=["Proba_Hillary", "Proba_Donald"])

In [124]:
joined_x = pd.merge(df, probas_x, left_index=True, right_index=True)

In [125]:
joined_x

Unnamed: 0,tweet_id,handle,retweet_count,text,mined_at,created_at,Proba_Hillary,Proba_Donald
0,1241367245143642113,realDonaldTrump,7453,....be put in use IMMEDIATELY. PEOPLE ARE DYIN...,2020-03-21 15:55:55.985052,Sat Mar 21 14:13:09 +0000 2020,0.413867,0.586133
0,1241014119572082694,HillaryClinton,20476,Where are the tests? https://t.co/VB9jeVwIfl,2020-03-21 15:56:04.063085,Fri Mar 20 14:49:57 +0000 2020,0.413867,0.586133
1,1241367239900778501,realDonaldTrump,19875,"HYDROXYCHLOROQUINE &amp; AZITHROMYCIN, taken t...",2020-03-21 15:55:55.985059,Sat Mar 21 14:13:08 +0000 2020,0.355246,0.644754
1,1241006585541013504,HillaryClinton,2700,Well said. It can't be easy to be a teenager r...,2020-03-21 15:56:04.063093,Fri Mar 20 14:20:01 +0000 2020,0.355246,0.644754
2,1241354162144784384,realDonaldTrump,5527,Great story. Thank you to Mr. Young of Jonesbo...,2020-03-21 15:55:55.985061,Sat Mar 21 13:21:10 +0000 2020,0.262154,0.737846
...,...,...,...,...,...,...,...,...
2546,780582940732424192,HillaryClinton,2186,"""It's about time this country had somebody run...",2020-03-21 15:56:11.387867,Tue Sep 27 01:40:50 +0000 2016,0.760796,0.239204
2547,780582541019475968,HillaryClinton,18327,"Paying zero in taxes doesn’t make you ""smart.""...",2020-03-21 15:56:11.387870,Tue Sep 27 01:39:14 +0000 2016,0.717995,0.282005
2548,780582086231126016,HillaryClinton,7634,No wonder Donald Trump is hiding his tax retur...,2020-03-21 15:56:11.387873,Tue Sep 27 01:37:26 +0000 2016,0.816262,0.183738
2549,780579514938261504,HillaryClinton,5243,Donald Trump may live in his own reality—but w...,2020-03-21 15:56:11.387876,Tue Sep 27 01:27:13 +0000 2016,0.525638,0.474362


In [126]:
joined_hillary = joined_x[joined_x['handle']=="HillaryClinton"]
for el in joined_hillary[joined_hillary['Proba_Hillary']==max(joined_hillary['Proba_Hillary'])]['text']:
    print(el)

"She did everything to make me feel like...I belonged in the White House. Can you imagine? From foster care to the White House." —Shane https://t.co/9cJs13GXRA


In [127]:
for el in joined_hillary[joined_hillary['Proba_Hillary']==min(joined_hillary['Proba_Hillary'])]['text']:
    print(el)

Today marks the start of open enrollment for 2018 health coverage. Go to https://t.co/LTQmZQt3jJ to find your perfect plan. #GetCoveredNow


In [128]:
joined_donald = joined_x[joined_x['handle']=="realDonaldTrump"]
for el in joined_donald[joined_donald['Proba_Donald']==max(joined_donald['Proba_Donald'])]['text']:
    print(el)

READ THE TRANSCRIPTS! The Impeachment Hoax is the greatest con job in the history of American politics! The Fake News Media, and their partner, the Democrat Party, are working overtime to make life for the United Republican Party, and all it stands for, as difficult as possible!


In [129]:
for el in joined_donald[joined_donald['Proba_Donald']==min(joined_donald['Proba_Donald'])]['text']:
    print(el)

“You go around Pennsylvania and you see Trump signs everywhere. The Donald Trump situation is bigger than the Reagan Revolution. Donald Trump has inspired us.” @RjHarris15  WHP580
