# Natural Language Processing and Modeling Notebook

## First off let's import packages we plan to use

In [177]:
import re
import sys
import json
import spacy
import string
import gensim
from tqdm import tqdm
import warnings
import imblearn
import requests
import twitter
import numpy as np
import xgboost as xgb
import pandas as pd
from scipy import stats 
import seaborn as sns
import texthero as hr
from PIL import Image 
from wordcloud import WordCloud
from nltk import pos_tag, FreqDist
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.manifold import TSNE
from collections import defaultdict
from mpl_toolkits.mplot3d import Axes3D
from nltk.tokenize import TweetTokenizer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier
from matplotlib.ticker import MaxNLocator
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction import text
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from nltk.stem import WordNetLemmatizer, PorterStemmer
from imblearn.metrics import classification_report_imbalanced
from nltk.tokenize import word_tokenize, regexp_tokenize, RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, confusion_matrix, classification_report, accuracy_score, precision_score
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")
%matplotlib inline

## We want to start by making the API calls necessary to aggregate the data we need and concatenate to a single dataframe.

In [178]:
#This call function allowed me to aggregate all the data I could from the Twitter API

def tweet_call(token):
    url = "https://api.twitter.com/2/tweets/search/recent?query=maskmandate&tweet.fields=geo,created_at,lang,possibly_sensitive,referenced_tweets,source&place.fields=country&max_results=100"

    payload={}
    headers = {
    'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAA87TQEAAAAAshYtyDyl0fN52qZbLOLUku%2Fln%2Bg%3DUNzKuiAu5XmoNCCHtl0403jh2wepuXWQGWLKL8o8ZjnJodU8b6',
    'Cookie': 'guest_id=v1%3A163060937974157763; personalization_id="v1_fPNQ4agp422tRlmCgiVxHA=="'}

    response = requests.request("GET", url, headers=headers, data=payload)
    next_token = response.json()['meta']['next_token']
    #print(response.json)

### This pair of for loops opens files, saves them as dataframs and then concatenates the results to a csv.

In [179]:
def aggregate_data(file):
    data = json.load(open(file))
    new_df = pd.DataFrame(data['data'])
    return new_df

In [180]:
results_list = []

for file in range(1,48):
    results_list.append(aggregate_data(f't{file}.json'))
    
results_df = pd.concat(results_list, ignore_index=True)

In [181]:
def cleaner(csv_filepath):
    header_list = ['ID', 'Referenced Type', 'Tweet', 'Sensitive', 'Source','Date', 'Language']
    org_df = pd.read_csv(csv_filepath, index_col=0, names = header_list)
    org_df.reset_index(inplace = True, drop = True)
    return org_df 

In [182]:
results_df.to_csv('tweets_df', mode='a', header=False)

### Let's explore new datatypes, create other features, eliminate copies in our data and filter to only unique tweet values.

In [183]:
results_df['Created'] = pd.to_datetime(results_df['created_at'])
results.columns()

In [184]:
results_df.text.duplicated().value_counts()

False    2587
True     1599
Name: text, dtype: int64

In [185]:
results_df.loc[results_df.text.duplicated(), :]

Unnamed: 0,possibly_sensitive,created_at,id,source,lang,referenced_tweets,text,geo,Created
20,False,2021-09-09T09:35:14.000Z,1435899590113325056,Twitter for iPhone,en,"[{'type': 'retweeted', 'id': '1433033391805538308'}]",RT @diecastryan: A classic ATL lineup. #delta #deltaairlines #airplane #aviation #avgeek #nikon #nikonphotography #atlanta #atlairport #tra…,,2021-09-09 09:35:14+00:00
21,False,2021-09-09T08:57:24.000Z,1435890069089312774,Twitter for Android,en,"[{'type': 'retweeted', 'id': '1433033391805538308'}]",RT @diecastryan: A classic ATL lineup. #delta #deltaairlines #airplane #aviation #avgeek #nikon #nikonphotography #atlanta #atlairport #tra…,,2021-09-09 08:57:24+00:00
22,False,2021-09-09T08:55:25.000Z,1435889570155950081,Twitter for Android,en,"[{'type': 'retweeted', 'id': '1433033391805538308'}]",RT @diecastryan: A classic ATL lineup. #delta #deltaairlines #airplane #aviation #avgeek #nikon #nikonphotography #atlanta #atlairport #tra…,,2021-09-09 08:55:25+00:00
33,False,2021-09-08T21:52:13.000Z,1435722670163775488,Twitter for iPhone,en,"[{'type': 'retweeted', 'id': '1435247630838288384'}]",RT @diecastryan: Who else is ready for fall?! #delta #deltaairlines #boston #fallcolors #boeing #aerialphotography #avgeek #airplane #aviat…,,2021-09-08 21:52:13+00:00
39,False,2021-09-08T18:29:44.000Z,1435671711538626560,Twitter for iPhone,en,"[{'type': 'retweeted', 'id': '1435640216384049153'}]",RT @omadnp: #DeltaAirlines - gate agent at #bna refused to enforce mask mandate in airport! Come on delta - you should be better than that!,,2021-09-08 18:29:44+00:00
...,...,...,...,...,...,...,...,...,...
4170,False,2021-09-02T19:44:06.000Z,1433516099786661892,Twitter for Android,en,"[{'type': 'retweeted', 'id': '1433268482527809536'}]",RT @theoztrucker: I was on a plane yesterday. \n\nMask all the way and social distance through the terminal.\n\nSit side by side on a packed pl…,,2021-09-02 19:44:06+00:00
4175,False,2021-09-02T19:31:19.000Z,1433512882642292738,Twitter Web App,en,"[{'type': 'retweeted', 'id': '1433345572405186562'}]",RT @ShentonStage: First time on a plane since 2019! @VirginAtlantic announce that mask wearing on the flight is MANDATORY as a condition of…,,2021-09-02 19:31:19+00:00
4178,False,2021-09-02T19:10:14.000Z,1433507576889544717,Twitter for iPhone,en,"[{'type': 'retweeted', 'id': '1433268482527809536'}]",RT @theoztrucker: I was on a plane yesterday. \n\nMask all the way and social distance through the terminal.\n\nSit side by side on a packed pl…,,2021-09-02 19:10:14+00:00
4181,False,2021-09-02T18:51:41.000Z,1433502907601588234,Twitter for Android,en,"[{'type': 'retweeted', 'id': '1433268482527809536'}]",RT @theoztrucker: I was on a plane yesterday. \n\nMask all the way and social distance through the terminal.\n\nSit side by side on a packed pl…,,2021-09-02 18:51:41+00:00


In [186]:
unique_df=results_df.drop_duplicates(subset='text', keep='first').copy()

In [187]:
unique_df.source.value_counts()

Twitter for iPhone               917
Twitter Web App                  620
Twitter for Android              486
Raspberry Pi - ADS-B Receiver    170
Twitter for iPad                  75
                                ... 
Quant Data (Production)            1
B9                                 1
SNAP101                            1
The Domain Bot                     1
mrproverbe                         1
Name: source, Length: 90, dtype: int64

In [189]:
unique_df=unique_df.fillna(0)

In [192]:
unique_df=unique_df.drop(unique_df['lang'].loc[unique_df['lang']!='en'].index)
unique_df=unique_df.drop(unique_df['source'].loc[unique_df['lang']=='Raspberry Pi - ADS-B Receiver'].index)

In [201]:
unique_df.loc[unique_df['geo']!= 0]

Unnamed: 0,possibly_sensitive,created_at,id,source,lang,referenced_tweets,text,geo,Created
12,False,2021-09-09T13:31:38.000Z,1435959083434119172,Twitter for iPhone,en,0,"A throwback to only last year, but it feels like an eternity ago. \n\nLast flight into Las Vegas in March 2020. Next flight into Las Vegas - this Sunday, for a trade show.\n#throwbackthursday #bu...",{'place_id': '7142eb97ae21e839'},2021-09-09 13:31:38+00:00
17,False,2021-09-09T11:46:02.000Z,1435932505190903815,Instagram,en,0,Columbus here we come! GO DUCKS!!! #oregonohiostate #cfb #ncaa #travel #goducks #flyhigh #deltaairlines @ Portland International Airport https://t.co/qfk1Jy0g9r,"{'place_id': 'ac88a4f17a51c7fc', 'coordinates': {'type': 'Point', 'coordinates': [-122.59622222, 45.58886979]}}",2021-09-09 11:46:02+00:00
29,False,2021-09-09T03:41:01.000Z,1435810449929424899,Instagram,en,0,@detroitwick @tripwipes #tripmitt got us covered! Thanks @a__smith for the hookup. #deltaairlines #laguardiaairport @ LaGuardia Airport https://t.co/gCiTYjxRFj,"{'place_id': '00c39537733fa112', 'coordinates': {'type': 'Point', 'coordinates': [-73.87288343, 40.77195338]}}",2021-09-09 03:41:01+00:00
40,False,2021-09-08T18:14:29.000Z,1435667876065824777,Twitter for iPhone,en,"[{'type': 'quoted', 'id': '1435611385828884486'}]",Whichever will provide more award travel. #DeltaAirlines https://t.co/fb2jvhvDn9,{'place_id': '019e0ca71523f3f0'},2021-09-08 18:14:29+00:00
93,False,2021-09-08T00:06:08.000Z,1435393983304306693,Twitter for iPhone,en,0,Hey @Delta typically love u guys but wtf kinda UPGRADE is this? Literally sitting IN THE AISLE. Most uncomfortable flight of my life. Would rather churn butter w/a colonial woman on the wing. #del...,{'place_id': '0fc7e0a406d55000'},2021-09-08 00:06:08+00:00
127,False,2021-09-07T16:35:57.000Z,1435280689059008513,Twitter for iPhone,en,"[{'type': 'replied_to', 'id': '1435278998318170114'}]",Good afternoon #interviews #deltaairlines #TaylorSwift #わんすたぐらむ #むくみ解消 #blacksabbathsabotage #とnyのキャップ #vaporwave #kitchenstadium #ｖａｐｏｒｗａｖｅ #ぷりんと倶楽部 #cutiesofinstagram #派手ネイル #williamshakespeare ...,{'place_id': 'b39bbbbd69b97fc0'},2021-09-07 16:35:57+00:00
191,False,2021-09-06T19:48:14.000Z,1434966694418653188,Instagram,en,0,I Iove technology and electronics. #socool #deltaairlines @ Hartsfield-Jackson Atlanta International Airport https://t.co/nOG4y277RX,"{'coordinates': {'type': 'Point', 'coordinates': [-84.41978006, 33.63975444]}, 'place_id': '7142eb97ae21e839'}",2021-09-06 19:48:14+00:00
201,False,2021-09-06T17:19:07.000Z,1434929163937271808,Twitter for iPhone,en,"[{'type': 'replied_to', 'id': '1434927041363824645'}]",@SouthwestAir Yo DJ Alex Reyes #djalexreyes wants to say that Southwest Airlines is the best airline alongside with @Delta #deltaairlines #iflysouthwest #heartone #iflydelta,{'place_id': '30344aecffe6a491'},2021-09-06 17:19:07+00:00
203,False,2021-09-06T16:22:24.000Z,1434914893216485376,Twitter for iPhone,en,0,It really is marvellous to photograph a plane like this @Delta Boeing 767 above something as naturally beautiful as the Alaskan Wilderness! \n\n#aviation #aerialphotography #aviationphotography #b...,{'place_id': '7a863bb88e5bb33c'},2021-09-06 16:22:24+00:00
211,False,2021-09-06T01:55:57.000Z,1434696844928438278,Instagram,en,0,"New Orleans. ✈️ \n\n#neworleans #luisiana #yourshotphotographer #airview #shotoniphone #vsco #vscocam #imaginarymagnitude #deltaairlines #expedia #expediapic @ New Orleans, Louisiana https://t.co...","{'coordinates': {'type': 'Point', 'coordinates': [-90.070864, 29.948479]}, 'place_id': 'dd3b100831dd1763'}",2021-09-06 01:55:57+00:00


### Doesn't seem that geo is all that helpful but since we have filtered on englidh tweets and removed the Raspberry Pi automated tweets now we can shed those columns and begin preprocessing.

In [24]:
#Running through an API for sentiment analysis

url = "https://text-analysis12.p.rapidapi.com/sentiment-analysis/api/v1.1"

payload = "{\"language\": \"english\", \"text\": \"I want less taxes.\"}"

headers = {
    'content-type': "application/json",
    'x-rapidapi-host': "text-analysis12.p.rapidapi.com",
    'x-rapidapi-key': "f8d244aeb1msha67e6edebf2951dp1304d6jsn215788b5fa29"
    }

response = requests.request("POST", url, data=payload, headers=headers)

print(response.json)

<bound method Response.json of <Response [200]>>


In [26]:
response.text

'{"time_taken":0.007585048675537109,"msg":"Sentiment Analysis successful","ok":true,"aggregate_sentiment":{"neg":0.0,"neu":0.606,"pos":0.394,"compound":0.0772},"sentiment_list":[{"neg":0.0,"neu":0.606,"pos":0.394,"compound":0.0772,"sentence":"I want less taxes."}],"sentiment":"positive"}'

In [None]:
new_stopwords = ["sxsw", "android", "google", "apple", "ipad", "app", "austin", 'iphone']
punctuations = string.punctuation
stopwords.extend(new_stopwords)
stopwords.extend(punctuations)