# Tweet Sentiment NLP

In [1]:
# check if modules are installed
import importlib.util
import sys
from subprocess import PIPE, Popen, check_call  # PIPE / Popen are no longer used here; kept so the names stay defined

# Ask the kernel's own import system whether each module can be found rather
# than parsing `pip list` output: no shell or awk is needed, and the answer is
# for this interpreter, not for whichever pip happens to be first on PATH.
# find_spec only locates a top-level module; it does not execute it.
# For the four names below the pip distribution name equals the import name.
modules = ['contractions', 'demoji', 'kaggle', 'pandas']
for nm in modules:
    if importlib.util.find_spec(nm) is None:
        # install through the running interpreter so the package lands in the
        # kernel's environment; a failed install raises CalledProcessError
        check_call([sys.executable, '-m', 'pip', 'install', nm])

# make sure freshly installed packages are visible to the imports that follow
importlib.invalidate_caches()

In [2]:
# importing modules
import csv
import glob
import os
import re
import zipfile

import contractions
import demoji
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

In [3]:
# downloading emoji data
# Fetches demoji's emoji code table; the logged output below shows it being
# written to a codes.json file under the user's home directory.
# NOTE(review): demoji is not referenced by any other cell in this notebook,
# so this download may be unnecessary - confirm before removing.
# NOTE(review): newer demoji releases reportedly bundle the codes and deprecate
# this call - confirm against the installed version.
demoji.download_codes()

Downloading emoji data ...
... OK (Got response in 0.25 seconds)
Writing emoji data to /home/parkernisbet/.demoji/codes.json ...
... OK


In [4]:
# downloading dataset
api = KaggleApi()
api.authenticate()
dataset = 'kazanova/sentiment140'
csv_name = 'training.1600000.processed.noemoticon.csv'

# Drop any stale copy so the extraction below always yields a fresh file.
# Only "file does not exist" is an expected outcome; anything else (for
# example a permission problem) should surface instead of being swallowed.
try:
    os.remove(csv_name)
except FileNotFoundError:
    pass

api.dataset_download_file(dataset, file_name=csv_name, path='./')

# Locate the downloaded archive. `recursive=True` was dropped because it only
# changes how a '**' component is expanded and this pattern has none.
archives = glob.glob('train*.zip')
if not archives:
    raise FileNotFoundError(
        f"no archive matching 'train*.zip' found after downloading {csv_name}"
    )
fn = archives[0]

# Extract just the CSV (if the archive contains it), then discard the archive.
with zipfile.ZipFile(fn) as zip_file:
    if csv_name in zip_file.namelist():
        zip_file.extract(csv_name)
os.remove(fn)

In [5]:
# importing data
# The file has no header row; columns 0 and 5 are the only ones loaded and are
# labelled 'target' and 'text' respectively.
columns = ['target', 'text']
df_data = pd.read_csv(
    csv_name,
    header=None,
    usecols=[0, 5],
    names=columns,
)
print(f'df_data dimensions: {df_data.shape}')
df_data.head()

df_data dimensions: (1600000, 2)


Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [6]:
# importing emoticon descriptions
# header=0 together with `names` discards the file's own header row and labels
# the first two columns 'emoticon' and 'description'.
df_repl = pd.read_csv(
    'emoticon_descriptions.csv',
    names=['emoticon', 'description'],
    usecols=[0, 1],
    header=0,
)
print(f'df_repl dimensions: {df_repl.shape}')
df_repl.head()

df_repl dimensions: (145, 2)


Unnamed: 0,emoticon,description
0,:?),smile
1,:),smile
2,:-],smile
3,:],smile
4,:-3,smile


In [7]:
# moving to dictionary
# first column (emoticon) becomes the key, second column (description) the value
dict_emot = dict(zip(df_repl.iloc[:, 0], df_repl.iloc[:, 1]))

In [8]:
# adding html replacements
dict_emot.update({
    '&quot;': 'quote',
    '&amp;': 'and',
    '&lt;': 'less than',
    '&gt;': 'greater than',
})

# preprocess() applies these replacements one key at a time, in dict order, so
# a short key that occurs inside a longer one would be substituted first and
# leave the longer key unmatched (likewise an emoticon beginning with ';'
# could eat the tail of an entity such as '&quot;'). Re-insert the entries
# longest key first; the sort is stable, so keys of equal length keep their
# original relative order. Relies on dicts preserving insertion order.
dict_emot = dict(sorted(dict_emot.items(), key=lambda kv: len(kv[0]), reverse=True))

In [9]:
# creating dictionary of common abbreviations
# Every cell is lower-cased with the vectorised string accessor, one column at
# a time; DataFrame.applymap is deprecated in recent pandas releases, whereas
# this form works on old and new versions alike. Missing cells stay NaN here
# instead of raising inside a lambda.
df_repl = pd.read_csv('common_abbreviations.csv').apply(lambda col: col.str.lower())

# first column (abbreviation) becomes the key, second column the expansion
dict_abbr = dict(zip(df_repl.iloc[:, 0], df_repl.iloc[:, 1]))

In [10]:
# preprocessing tweet body text
def preprocess(text: str) -> str:
    '''
    Returns a preprocessed version of the passed string.

    Steps, in order: whitespace-delimited @mentions and http(s) links become
    USER / URL placeholders, every dict_emot key is replaced by its
    description, contractions are expanded, non-alphanumeric characters are
    blanked, the text is lower-cased, runs of three or more identical letters
    are cut to two, whole-word dict_abbr keys are expanded, and runs of
    spaces are collapsed to one.

    Reads the module-level dictionaries dict_emot and dict_abbr.

        Parameters:
            text (str) : passed string

        Returns:
            mod_text (str) : preprocessed string (a single leading and/or
                trailing space may remain)
    '''

    # replace usernames: an @-token that starts the text or follows whitespace
    mod_text = re.sub(
        r'(?:(?<=\s)|(?<=^))(@\S+)(?:(?=\s)|(?=$))', ' USER ', text)

    # replace urls (done before the emoticon pass, which works on raw substrings)
    mod_text = re.sub(
        r'(?:(?<=^)|(?<=\s))(https?:\/\/\S+)(?:(?=$)|(?=\s))', ' URL ', mod_text)

    # replace emoticons with text; plain substring replacement applied in
    # dict_emot's iteration order
    for emoticon, description in dict_emot.items():
        mod_text = mod_text.replace(emoticon, ' ' + description + ' ')

    # replace unrecognized characters
    # NOTE(review): presumably a workaround for contractions.fix mishandling
    # this character - confirm
    mod_text = mod_text.replace('İ', 'I')

    # expand contractions
    mod_text = contractions.fix(mod_text)

    # remove non-alphabet characters (ASCII letters and digits are kept)
    mod_text = re.sub(r'[^a-zA-Z0-9]', ' ', mod_text)

    # lower case text
    mod_text = mod_text.lower()

    # truncate repeated characters: only letters are squeezed, so elongated
    # words shrink while numbers such as 1000 keep all their digits (runs of
    # spaces are handled by the whitespace step at the end)
    mod_text = re.sub(r'([a-z])\1{2,}', r'\1\1', mod_text)

    # replace common abbreviations: a key must be delimited by a space or by
    # the start/end of the text. The lookarounds do not consume the delimiter,
    # so an abbreviation is also expanded when it is the first or last word
    # and when it is repeated across a single space.
    for abbr, expansion in dict_abbr.items():
        # cheap substring test first; the regex only runs for candidates
        if abbr in mod_text:
            pattern = r'(?<![^ ])' + re.escape(abbr) + r'(?![^ ])'
            # a callable replacement keeps the expansion literal
            mod_text = re.sub(
                pattern, lambda _m, _exp=expansion: _exp, mod_text)

    # remove repeated whitespace
    mod_text = re.sub(r' {2,}', ' ', mod_text)

    return mod_text


In [12]:
# apply function to dataframe
# every tweet body is passed through preprocess() and written back into 'text'
df_data.loc[:, 'text'] = df_data['text'].map(preprocess)
df_data.head()

Unnamed: 0,target,text
0,0,user url aww that is a bummer you shoulda got...
1,0,is upset that he can not update his facebook b...
2,0,user i dived many times for the ball managed ...
3,0,my whole body feels itchy and like its on fire
4,0,user no it is not behaving at all i am mad wh...


To-do: add stop-word removal and lemmatization; investigate typo correction.