# Kaggle Dataset Source:

https://www.kaggle.com/kazanova/sentiment140#training.1600000.processed.noemoticon.csv

In [None]:
import pandas as pd
import json
import re
import spacy
import html
from nltk.corpus import stopwords
import string
%matplotlib inline
# Load spaCy's small English pipeline once at module level; lemmatise() calls it per tweet.
nlp = spacy.load("en_core_web_sm")
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Directory prefix for the input CSV and every CSV this notebook writes
# (it is concatenated directly with file names, hence the trailing separator).
# NOTE(review): machine-specific absolute Windows path — adjust before running elsewhere.
BASE = 'C:\\Users\\basharm\\PythonJupyter\\CoVID19CodeGit\\data\\sentiment_data\\'

In [None]:
def lemmatise(tweet):
    """Return *tweet* with every token replaced by its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    resulting lemmas are joined back together with single spaces.
    """
    return ' '.join(tok.lemma_ for tok in nlp(tweet))

# Quick sanity check: run lemmatise() on a sample sentence and display the result.
tweet = u"Apple is looking at buying U.K. startup for $1 billion"
lemmatise(tweet)

In [None]:
# Matches any run of two or more spaces.
re1 = re.compile(r'  +')

# Ordered (needle, replacement) pairs. Order matters: each substitution
# operates on the output of the ones before it.
_FIXUP_STEPS = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
    ('\n', ' '),
    ('\t', ' '),
    ('\r', ' '),
    ('rt @', '@'),
)


def fixup(x):
    """Clean up erroneous characters and leftover markup in *x*.

    Applies a fixed sequence of literal substitutions (broken HTML entity
    fragments, escaped newlines/quotes, ``<br />``, ``<unk>``, ``@.@`` /
    ``@-@`` separators, backslashes, whitespace control characters and a
    leading-style ``rt @``), then HTML-unescapes the result and collapses
    runs of two or more spaces into a single space.
    """
    for needle, replacement in _FIXUP_STEPS:
        x = x.replace(needle, replacement)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

In [None]:
def preprocess_a_tweet(tweet):
    """Normalise a raw tweet into lower-cased, lemmatised text.

    Steps, in order:
      1. drop whitespace-separated tokens found in ``stop_words``;
      2. repair entities/markup via ``fixup``;
      3. remove retweet prefixes, @mentions and URLs;
      4. strip ASCII punctuation;
      5. squeeze any character repeated 3+ times down to two;
      6. collapse whitespace, lemmatise and lower-case.

    Punctuation is stripped only *after* steps 2-3: those steps key on
    characters such as ``@``, ``:``, ``#``, ``;`` and ``&``, so removing
    punctuation first would make them no-ops and leave user names in the
    output.

    Args:
        tweet: the raw tweet text (``str``).

    Returns:
        The cleaned tweet as a lower-case string.
    """
    tweet = ' '.join(w for w in tweet.split() if w not in stop_words)
    tweet = fixup(tweet)
    tweet = re.sub(r'^RT @\w+:', '', tweet).strip()   # retweet, unmodified
    tweet = re.sub(r'^MRT @\w+:', '', tweet).strip()  # retweet, modified
    tweet = re.sub(r'@\w+', '', tweet).strip()        # @Person occurrence
    tweet = re.sub(r'http\S+', '', tweet).strip()     # URL occurrence
    # Safe to drop punctuation now that the marker characters have been used.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet).strip()  # fix repeating characters
    # The removals above can leave gaps; collapse them so the lemmatiser
    # does not emit whitespace tokens.
    tweet = ' '.join(tweet.split())
    tweet = lemmatise(tweet)
    return tweet.lower()
# Quick sanity check of preprocess_a_tweet() on a sample containing a repeated-character
# run and mis-decoded (mojibake) characters.
tweet = 'aaaaaa Canâ€™t wait to pop out of no where with this one ðŸ¤« untill then imma stay deep in this ðŸŽ’'
preprocess_a_tweet(tweet)

In [None]:
# Read the raw Sentiment140 CSV. The file has no header row, so columns
# come back numbered; it is decoded as ISO-8859-1.
df_in = pd.read_csv(
    BASE + 'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='ISO-8859-1',
)
df_in.head()

### The dataset has the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

ids: the id of the tweet (2087)

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY.

user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

In [None]:
# Give the six positional columns their Sentiment140 field names.
df_in.columns = [
    'target',
    'ids',
    'date',
    'flag',
    'user',
    'text',
]
df_in.head()

In [None]:
# Clean every tweet in a single pass over the 'text' column.
# Series.apply avoids materialising a full row Series per tweet via
# ``df_in.iloc[idx]`` and no longer mixes label-based ``.at`` with
# position-based ``.iloc`` (which only lines up for a default RangeIndex).
df_in['text'] = df_in['text'].apply(preprocess_a_tweet)
df_in.head()

In [None]:
# Drop every row that has at least one missing value, in place.
# Only ``inplace`` is passed: row-wise / how='any' are dropna's defaults, and
# recent pandas versions raise TypeError when ``how`` and ``thresh`` are both
# supplied explicitly.
df_in.dropna(inplace=True)
df_in.head()

In [None]:
# Persist the full pre-processed dataset. ``index`` is documented as a bool,
# so pass False explicitly to omit the row index from the file.
df_in.to_csv(BASE + 'sentiment_tweets_pp.csv', encoding='utf8', index=False)

In [None]:
# Number of rows that a 10% hold-out corresponds to.
0.1 * len(df_in)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. The split is seeded because both
# halves are written to disk below; without a fixed random_state every
# re-run would produce different train/test files.
df_train, df_test = train_test_split(df_in, test_size=0.1, random_state=42)

In [None]:
# Save the training split. ``index`` is documented as a bool, so pass False
# explicitly to omit the row index from the file.
df_train.to_csv(BASE + 'train_pp.csv', encoding='utf8', index=False)

In [None]:
# Save the test split. ``index`` is documented as a bool, so pass False
# explicitly to omit the row index from the file.
df_test.to_csv(BASE + 'test_pp.csv', encoding='utf8', index=False)

In [None]:
## Plot
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Join all cleaned tweets into one corpus string and remove every run of
# non-ASCII characters before handing the text to WordCloud.
corpus = ' '.join(df_in['text'])
ascii_corpus = re.sub("([^\x00-\x7F])+", "", corpus)
wordcloud = WordCloud().generate(ascii_corpus)

# Show the generated image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()