# NLP/Sentiment Analysis

### Import Dependencies

In [69]:
from textblob import TextBlob
import nltk
import string
import pandas as pd
import random
from collections import Counter
import matplotlib.pyplot as plt

### Connecting with MongoDB

In [70]:
# Importing dependencies to import datasets from MongoDB
from pymongo import MongoClient
import os

In [71]:
# Creating a connection with MongoDB
client=MongoClient('localhost', 27017)

In [72]:
# Providing list of datasets for US Elections Twitter data
db=client.us_election_twitter
collect_names=db.list_collection_names()
collect_names

['romney3_12_df',
 'mccain1_08_df',
 'romney1_12_df',
 'barack1_08_df',
 'trump3_20_df',
 'trump2_20_df',
 'trump1_20_df',
 'trump1_16_df',
 'trump2_16_df',
 'hillary2_16_df',
 'romney2_12_df',
 'biden1_20_df',
 'biden2_20_df',
 'barack3_12_df',
 'biden3_20_df',
 'mccain3_08_df',
 'barack3_08_df',
 'mccain2_08_df',
 'hillary1_16_df',
 'barack2_12_df',
 'trump3_16_df',
 'barack1_12_df',
 'barack2_08_df',
 'hillary3_16_df']

### Importing Collections from MongoDB

In [91]:
data=db.biden3_20_df
h_list=data.find()
biden3_20=pd.DataFrame(list(data.find()))

In [92]:
data=db.trump3_20_df
h_list=data.find()
trump3_20=pd.DataFrame(list(data.find()))

### Random Samples of Datasets

In [93]:
# As this was a big data project we decided to choose random samples of 30 percent for each 
# dataset to facilitate quicker analysis and lower the probability of slow machine performance
biden3_20=biden3_20.sample(frac = 0.3)
trump3_20=trump3_20.sample(frac = 0.3)

### Preprocessing of Data

In [95]:
# Keeping the following columns: 'tweet', 'replies_count', 'retweets_count', 'like_count'
biden3_20=pd.DataFrame(biden3_20, columns=['tweet','replies_count','retweets_count','likes_count'])
trump3_20=pd.DataFrame(trump3_20, columns=['tweet','replies_count','retweets_count','likes_count'])
biden3_20['candidate']='Biden' # Adding separate column to identify candidate
trump3_20['candidate']='Trump' # Adding separate column to identify candidate

### Create Funciton to Clean Tweets

In [96]:
# Import Dependency for RegEx
import re
# import emoji 

def cleantweet(text):
    text=re.sub(r'@[A-Za-z0-9_]+', '', text) # This removes @ mentions
    text=re.sub(r'https?:\/\/\S+', '', text) # This removes the hyperlinks
    text=re.sub(r'#[A-Za-z0-9_]+', '', text) # This removes the hashtag mentions
    text=re.sub(r'\W', ' ', text) # This removes all special characters
    text=re.sub(r'[^\x00-\x7F]+',' ', text) # This replaces non-ASCII characters with space
    text=re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # Removing all single characters left as a result of removing all special characters
    text=re.sub(r'\^[a-zA-Z]\s+', ' ', text) # Removing all single characters from the start
    text=re.sub(r'_', '', text) # This removes underscore symbols
    text=re.sub(r'\s+', ' ', text, flags=re.I) # Replacing multiple spaces with single spaces
#     text = text.lower() # Converts all text to lowercase
#     text=emoji.get_emoji_regexp().sub(u'',text) # This removes the emojis
#     text=re.sub('\n', ' ', text) # This removes linebreaks
    return text

### Creating Function to Drop Empty Tweets

In [97]:
import numpy as np
def drop_empty_tweets(db):
    db=db.replace(r'^\s*$', np.nan, regex=True) # Replace empty cells with NaN value
    return db

### Cleaning tweets, dropping rows with NaN values and Creating New Column

In [98]:
# Applying function to clean tweets and drop empty tweets for Biden
biden3_20['tweet']=biden3_20['tweet'].apply(cleantweet)
biden3_20=biden3_20.apply(drop_empty_tweets)

In [99]:
# Applying function to clean tweets and drop empty tweets for Trump
trump3_20['tweet']=trump3_20['tweet'].apply(cleantweet)
trump3_20=trump3_20.apply(drop_empty_tweets)

### Dropping NaN values

In [100]:
biden3=biden3_20.dropna()
trump3=trump3_20.dropna()

### Creating Length of Tweets Feature

In [103]:
biden3['tweet'] = biden3['tweet'].astype(str) # Converting tweet column to string before splitting
biden3['tweet_length']= biden3['tweet'].str.split().str.len() # Creating separate column with length of tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [104]:
trump3['tweet'] = trump3['tweet'].astype(str) # Converting tweet column to string before splitting
trump3['tweet_length']= trump3['tweet'].str.split().str.len() # Creating separate column with length of tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Joining both Biden and Trump Datasets

In [105]:
# Joining both Biden and Trump Datasets
third_debate=pd.concat([biden3,trump3])
third_debate.count()

tweet             255601
replies_count     255601
retweets_count    255601
likes_count       255601
candidate         255601
tweet_length      255601
dtype: int64

In [106]:
# Create function to obtain subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# Create function to obtain polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

# Create Columns for Subjectivity and Polarity
third_debate['Subjectivity']=third_debate['tweet'].apply(getSubjectivity)
third_debate['Polarity']=third_debate['tweet'].apply(getPolarity)
third_debate.head(60)

Unnamed: 0,tweet,replies_count,retweets_count,likes_count,candidate,tweet_length,Subjectivity,Polarity
132974,So why was trump elected,0,0,0,Biden,5,0.0,0.0
190459,Oklahoma early voting is October 29 30 and 31...,0,0,0,Biden,15,0.3,0.1
22717,Open the state,0,0,0,Biden,3,0.5,0.0
90782,if you haven noticed more than half the count...,0,1,0,Biden,20,0.222222,0.111111
215578,It came down to muting the president That what...,0,1,1,Biden,40,0.489815,0.028241
212650,Boom careful what you wish for Joe Biden,0,0,0,Biden,8,1.0,-0.1
155792,Joe Biden what are you talking to yourself at...,0,0,0,Biden,47,0.9,-0.575
263225,When called Lincoln racist and said that he w...,0,0,0,Biden,27,0.288889,-0.155556
29375,got my ass kicked,0,0,0,Biden,4,0.0,0.0
211662,Alyssa m so disappointed in You was always hu...,0,0,0,Biden,30,0.54,-0.04


In [107]:
# Obtaining Polarity Analysis
def getPolarityAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
    
third_debate['Sentiment']=third_debate['Polarity'].apply(getPolarityAnalysis)
third_debate.head(60)

Unnamed: 0,tweet,replies_count,retweets_count,likes_count,candidate,tweet_length,Subjectivity,Polarity,Sentiment
132974,So why was trump elected,0,0,0,Biden,5,0.0,0.0,Neutral
190459,Oklahoma early voting is October 29 30 and 31...,0,0,0,Biden,15,0.3,0.1,Positive
22717,Open the state,0,0,0,Biden,3,0.5,0.0,Neutral
90782,if you haven noticed more than half the count...,0,1,0,Biden,20,0.222222,0.111111,Positive
215578,It came down to muting the president That what...,0,1,1,Biden,40,0.489815,0.028241,Positive
212650,Boom careful what you wish for Joe Biden,0,0,0,Biden,8,1.0,-0.1,Negative
155792,Joe Biden what are you talking to yourself at...,0,0,0,Biden,47,0.9,-0.575,Negative
263225,When called Lincoln racist and said that he w...,0,0,0,Biden,27,0.288889,-0.155556,Negative
29375,got my ass kicked,0,0,0,Biden,4,0.0,0.0,Neutral
211662,Alyssa m so disappointed in You was always hu...,0,0,0,Biden,30,0.54,-0.04,Negative


### Convert ML_Data to CSV

In [108]:
third_debate.to_csv(r'C:\Users\Greg\Documents\Analysis_Projects\Final_Project\Concat_ML_Data\third_debate.csv', index = False, header=True)