In [187]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from datetime import datetime, timedelta
 
import seaborn as sns
import re
import nltk

from sklearn.metrics import accuracy_score,roc_auc_score, f1_score, recall_score
from sklearn.metrics import  roc_curve, confusion_matrix, precision_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()


from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

import mysql.connector

import warnings
warnings.filterwarnings('ignore') 
%matplotlib inline

In [188]:
# Widen the notebook display: show full cell text and allow more columns/rows.
# `None` means "do not truncate"; the old `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_columns", 50)

pd.set_option('display.max_info_columns', 500)
pd.set_option('display.max_rows', 500)

In [189]:
import os
from getpass import getpass

# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Host, user and database fall back to the values
# this notebook used before; the password is prompted for when MYSQL_PASSWORD
# is not set.
db = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    passwd=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "twitter_2"),
    charset='utf8',
)


In [190]:
# Sanity check: list the tables visible through the open connection.
mycursor = db.cursor()
mycursor.execute("SHOW TABLES")

# Iterating the cursor yields one result row (a tuple) per table.
for table_row in mycursor:
    print(table_row)
    

('twitter_2',)


In [191]:
# Capture the current time (naive datetime from datetime.now()) and display it.
time_now = datetime.now()
format(time_now, '%Y-%m-%d %H:%M:%S')

'2020-05-16 16:57:01'

In [192]:
# Look-back window that is subtracted from `time_now` in the next cell.
# NOTE(review): the variable name says 10 minutes, but the value is
# 400 minutes (24000 seconds) — confirm which one is intended.
time_10mins_before = timedelta(minutes=400)
time_10mins_before

datetime.timedelta(seconds=24000)

In [193]:
# Earliest timestamp to load: current time minus the look-back window.
time_interval = time_now - time_10mins_before
format(time_interval, '%Y-%m-%d %H:%M:%S')

'2020-05-16 10:17:01'

In [194]:
# Load the tweets created at or after `time_interval`.
# The cutoff is passed as a bound parameter (`%s` placeholder) instead of being
# formatted into the SQL text, so the driver takes care of quoting and of
# converting the datetime value.
query = "SELECT id_str, created_at, text, processed FROM twitter_2 WHERE created_at >= %s"

df = pd.read_sql(query, con=db, params=[time_interval])

In [195]:
# Make sure `created_at` is stored with a datetime dtype.
# NOTE(review): the original comment says the timestamps are UTC by default —
# presumably how they were stored; no timezone is attached here. Confirm.
df['created_at'] = df['created_at'].pipe(pd.to_datetime)

In [196]:
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id_str,created_at,text,processed
0,1261644239621980162,2020-05-16 13:06:41,"Have confidence in the young ppl. Give us a chance, and we'll surprise you. \n#coronavirus #StayHome #youngblood",-1
1,1261644240611938304,2020-05-16 13:06:41,"RT @TeamTrump: President @realDonaldTrump to NYPD officer who recovered from #Coronavirus: Youre my hero."" https://t.co/3bRiDuUfN6",-1
2,1261644243174666241,2020-05-16 13:06:42,"RT @TomFitton: VOTER FRAUD: Tom Fitton: Mail-In Voting Can Be a ""Recipe for CHAOS"" on Election Day! PLUS The soft totalitarianism of the a",-1
3,1261644251802238977,2020-05-16 13:06:44,"RT @LiarTable: #China #WorldHealthOrganization, #coronavirus #Dobbs #WuhanFlu @LouDobbs .@WHO #Canada #TrumpIsALaughingStock .@realDonaldT",-1
4,1261644252657922048,2020-05-16 13:06:44,"RT @TeamTrump: President @realDonaldTrump to NYPD officer who recovered from #Coronavirus: Youre my hero."" https://t.co/3bRiDuUfN6",-1
5,1261644253463281665,2020-05-16 13:06:44,RT @Sostenedor: Florida is opening May 18. Consider @CaseyAskar for US Congress FL-19 to help Governor DeSantis and President Trump #MAGA2,-1
6,1261644254813646854,2020-05-16 13:06:45,Confirmed cases of #coronavirus in #Srinagar on Saturday : 03\n\n- #Khankah Movla - 01\n- #Kakasarai - 01\n- #Soura - 01,-1
7,1261644254712979457,2020-05-16 13:06:45,Fourth set of initiative announced by Finance Minister Nirmala Sitharaman. Today FM announced 11 major boosters foc https://t.co/40FU0jAS0A,-1
8,1261644257078784000,2020-05-16 13:06:45,RT @jamesclossick: This grim reapers hypocrisy is both unlimited and transparent. https://t.co/VwCC82Uv83,-1
9,1261644257800196097,2020-05-16 13:06:45,https://t.co/rkdMTUYUrP\n#Lockdown \n#ChineseVirus \n#coronavirus \n#Covid_19 \n#StayHome \nThis guy is so right\n#IwantToGoToWork,-1


In [197]:
def clean_text(string):
    """Strip URLs, @handles and non-letter characters from a tweet.

    Intended to mirror the preprocessing used when the model was trained,
    so the order of the substitutions below matters.

    Parameters
    ----------
    string : str
        Raw tweet text. Non-string values (e.g. ``None`` or NaN coming from
        a NULL column) are treated as empty text.

    Returns
    -------
    str
        The text with URLs and @handles removed and every character other
        than ASCII letters, ``'`` and ``#`` replaced by a space. Runs of
        whitespace are not collapsed.
    """
    if not isinstance(string, str):
        return ''
    # Removes both http:// and https:// links in one pass. A separate
    # "http://t" substitution used to run first; it stripped only the prefix
    # of plain-http links and left the rest of the URL (".co/...") behind as
    # stray words.
    new_str = re.sub(r"https?://[A-Za-z0-9./]*", '', string)
    # removes a '#' that is immediately followed by a digit (and that digit)
    new_str = re.sub(r"#[0-9]", '', new_str)
    # removes @handles
    new_str = re.sub(r"@[\w]*", '', new_str)
    # everything that is not a letter, an apostrophe or '#' becomes a space
    new_str = re.sub(r"[^a-zA-Z'#]", ' ', new_str)
    return new_str

In [198]:
# Clean every tweet element-wise; the cleaned text replaces the raw column.
df['text'] = df['text'].map(clean_text)

In [199]:
# Keep only the words longer than 2 characters and re-join them with single spaces.
df['tweet_more_2_charac'] = df['text'].apply(
    lambda tweet: ' '.join(filter(lambda token: len(token) > 2, tweet.split()))
)

In [200]:
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id_str,created_at,text,processed,tweet_more_2_charac
0,1261644239621980162,2020-05-16 13:06:41,Have confidence in the young ppl Give us a chance and we'll surprise you #coronavirus #StayHome #youngblood,-1,Have confidence the young ppl Give chance and we'll surprise you #coronavirus #StayHome #youngblood
1,1261644240611938304,2020-05-16 13:06:41,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero
2,1261644243174666241,2020-05-16 13:06:42,RT VOTER FRAUD Tom Fitton Mail In Voting Can Be a Recipe for CHAOS on Election Day PLUS The soft totalitarianism of the a,-1,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the
3,1261644251802238977,2020-05-16 13:06:44,RT #China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock,-1,#China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock
4,1261644252657922048,2020-05-16 13:06:44,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero
5,1261644253463281665,2020-05-16 13:06:44,RT Florida is opening May Consider for US Congress FL to help Governor DeSantis and President Trump #MAGA,-1,Florida opening May Consider for Congress help Governor DeSantis and President Trump #MAGA
6,1261644254813646854,2020-05-16 13:06:45,Confirmed cases of #coronavirus in #Srinagar on Saturday #Khankah Movla #Kakasarai #Soura,-1,Confirmed cases #coronavirus #Srinagar Saturday #Khankah Movla #Kakasarai #Soura
7,1261644254712979457,2020-05-16 13:06:45,Fourth set of initiative announced by Finance Minister Nirmala Sitharaman Today FM announced major boosters foc,-1,Fourth set initiative announced Finance Minister Nirmala Sitharaman Today announced major boosters foc
8,1261644257078784000,2020-05-16 13:06:45,RT This grim reapers hypocrisy is both unlimited and transparent,-1,This grim reapers hypocrisy both unlimited and transparent
9,1261644257800196097,2020-05-16 13:06:45,#Lockdown #ChineseVirus #coronavirus #Covid #StayHome This guy is so right #IwantToGoToWork,-1,#Lockdown #ChineseVirus #coronavirus #Covid #StayHome This guy right #IwantToGoToWork


# Lemmatization with POS

In [201]:
# mapping NLTK POS tags onto the WordNet POS constants
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Return the WordNet POS constant for an NLTK POS tag.

    Only the first character of the tag is examined: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) yields None.
    """
    # first tag letter -> name of the attribute on `wordnet`
    pos_attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = pos_attr_by_prefix.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

# lemmatizes every word of a sentence using its part of speech
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize `sentence`; return the tokens joined by spaces.

    Tokens whose tag has no WordNet equivalent are kept unchanged, and so is
    the literal token 'ass' (the original author noted it was otherwise
    reduced to 'as').
    """
    # (token, POS tag) pairs for the whole sentence
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wn_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if token == 'ass' or wn_pos is None:
            # nothing to lemmatize with: keep the token as it is
            lemmas.append(token)
            continue
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)


In [202]:
# Lemmatize each filtered tweet; the function is passed directly, no wrapper needed.
df['lemmatized'] = df['tweet_more_2_charac'].apply(lemmatize_sentence)

In [203]:
# Preview the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id_str,created_at,text,processed,tweet_more_2_charac,lemmatized
0,1261644239621980162,2020-05-16 13:06:41,Have confidence in the young ppl Give us a chance and we'll surprise you #coronavirus #StayHome #youngblood,-1,Have confidence the young ppl Give chance and we'll surprise you #coronavirus #StayHome #youngblood,Have confidence the young ppl Give chance and we 'll surprise you # coronavirus # StayHome # youngblood
1,1261644240611938304,2020-05-16 13:06:41,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero,President NYPD officer who recover from # Coronavirus Youre hero
2,1261644243174666241,2020-05-16 13:06:42,RT VOTER FRAUD Tom Fitton Mail In Voting Can Be a Recipe for CHAOS on Election Day PLUS The soft totalitarianism of the a,-1,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the
3,1261644251802238977,2020-05-16 13:06:44,RT #China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock,-1,#China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock,# China # WorldHealthOrganization # coronavirus # Dobbs # WuhanFlu # Canada # TrumpIsALaughingStock
4,1261644252657922048,2020-05-16 13:06:44,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero,President NYPD officer who recover from # Coronavirus Youre hero
5,1261644253463281665,2020-05-16 13:06:44,RT Florida is opening May Consider for US Congress FL to help Governor DeSantis and President Trump #MAGA,-1,Florida opening May Consider for Congress help Governor DeSantis and President Trump #MAGA,Florida opening May Consider for Congress help Governor DeSantis and President Trump # MAGA
6,1261644254813646854,2020-05-16 13:06:45,Confirmed cases of #coronavirus in #Srinagar on Saturday #Khankah Movla #Kakasarai #Soura,-1,Confirmed cases #coronavirus #Srinagar Saturday #Khankah Movla #Kakasarai #Soura,Confirmed case # coronavirus # Srinagar Saturday # Khankah Movla # Kakasarai # Soura
7,1261644254712979457,2020-05-16 13:06:45,Fourth set of initiative announced by Finance Minister Nirmala Sitharaman Today FM announced major boosters foc,-1,Fourth set initiative announced Finance Minister Nirmala Sitharaman Today announced major boosters foc,Fourth set initiative announce Finance Minister Nirmala Sitharaman Today announce major booster foc
8,1261644257078784000,2020-05-16 13:06:45,RT This grim reapers hypocrisy is both unlimited and transparent,-1,This grim reapers hypocrisy both unlimited and transparent,This grim reaper hypocrisy both unlimited and transparent
9,1261644257800196097,2020-05-16 13:06:45,#Lockdown #ChineseVirus #coronavirus #Covid #StayHome This guy is so right #IwantToGoToWork,-1,#Lockdown #ChineseVirus #coronavirus #Covid #StayHome This guy right #IwantToGoToWork,# Lockdown # ChineseVirus # coronavirus # Covid # StayHome This guy right # IwantToGoToWork


In [206]:
# getting rid of the hashtag markers: after lemmatization "#tag" appears as
# "# tag", so the literal "# " is dropped. regex=False pins literal matching,
# because the default of `regex` is not the same in every pandas version.
df['lemmatized_no_ash'] = df['lemmatized'].str.replace('# ', '', regex=False)

In [207]:
# Show the first five rows (the default for head()).
df.head()

Unnamed: 0,id_str,created_at,text,processed,tweet_more_2_charac,lemmatized,lemmatized_no_ash
0,1261644239621980162,2020-05-16 13:06:41,Have confidence in the young ppl Give us a chance and we'll surprise you #coronavirus #StayHome #youngblood,-1,Have confidence the young ppl Give chance and we'll surprise you #coronavirus #StayHome #youngblood,Have confidence the young ppl Give chance and we 'll surprise you # coronavirus # StayHome # youngblood,Have confidence the young ppl Give chance and we 'll surprise you coronavirus StayHome youngblood
1,1261644240611938304,2020-05-16 13:06:41,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero,President NYPD officer who recover from # Coronavirus Youre hero,President NYPD officer who recover from Coronavirus Youre hero
2,1261644243174666241,2020-05-16 13:06:42,RT VOTER FRAUD Tom Fitton Mail In Voting Can Be a Recipe for CHAOS on Election Day PLUS The soft totalitarianism of the a,-1,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the,VOTER FRAUD Tom Fitton Mail Voting Can Recipe for CHAOS Election Day PLUS The soft totalitarianism the
3,1261644251802238977,2020-05-16 13:06:44,RT #China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock,-1,#China #WorldHealthOrganization #coronavirus #Dobbs #WuhanFlu #Canada #TrumpIsALaughingStock,# China # WorldHealthOrganization # coronavirus # Dobbs # WuhanFlu # Canada # TrumpIsALaughingStock,China WorldHealthOrganization coronavirus Dobbs WuhanFlu Canada TrumpIsALaughingStock
4,1261644252657922048,2020-05-16 13:06:44,RT President to NYPD officer who recovered from #Coronavirus Youre my hero,-1,President NYPD officer who recovered from #Coronavirus Youre hero,President NYPD officer who recover from # Coronavirus Youre hero,President NYPD officer who recover from Coronavirus Youre hero


# Vectorizing TF-IDF for Fresh Tweets

In [209]:
# Set up the TF-IDF vectorizer (nothing is fitted yet).
tfidf_vectors = TfidfVectorizer(
    lowercase=True,          # lower-case the text before tokenizing
    stop_words='english',    # built-in English stop-word list
    ngram_range=(1, 3),      # unigrams, bigrams and trigrams
    min_df=2,                # term must occur in at least 2 documents
    max_df=0.90,             # ...and in at most 90% of the documents
    max_features=9000,       # cap on the vocabulary size
)



In [210]:
# TF-IDF feature matrix: learns the vocabulary and IDF weights from these
# tweets and transforms them in the same call.
# NOTE(review): fitting here builds the vocabulary from the fresh tweets only.
# If these features are meant for a previously trained model, the vectorizer
# fitted at training time would presumably have to be reused (transform only)
# so the columns line up — confirm against the rest of the notebook.
tfidf = tfidf_vectors.fit_transform(df['lemmatized_no_ash'])