In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# load the tweet dataset from disk and preview its first rows
df = pd.read_csv(filepath_or_buffer='data/tweets.csv')
df.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Data Preprocessing

In [3]:
# Drop unwanted columns.
# The result is reassigned (rather than using inplace=True) and
# errors='ignore' is passed so that this cell can be re-run safely:
# a second execution no longer raises KeyError when the columns
# have already been removed.
df = df.drop(columns=['textID', 'selected_text'], errors='ignore')

In [4]:
# Summarise the frame: column dtypes and non-null counts.
# Any column whose non-null count is below the total number of entries
# contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       27480 non-null  object
 1   sentiment  27481 non-null  object
dtypes: object(2)
memory usage: 429.5+ KB


In [5]:
# discard every row that has a missing (NaN) value in any column
df.dropna(axis=0, how='any', inplace=True)

In [6]:
# sanity check: per-column count of missing values still present
df.isna().sum(axis=0)

text         0
sentiment    0
dtype: int64

In [7]:
# convert the text column to strings and lower-case every entry
df['text'] = (
    df['text']
    .astype(str)
    .str.lower()
)

In [8]:
# Strip noise from the text in a single regex pass. The alternatives, in
# order, match:
#   https?://\S+  -> http/https URLs
#   @\w+          -> user mentions
#   #             -> the '#' sign only (the hashtag word itself is kept)
#   \d            -> digits
#   [^\w\s]       -> any remaining non-word, non-whitespace character
df['text'] = df['text'].str.replace(
    pat=r'https?://\S+|@\w+|#|\d|[^\w\s]',
    repl='',
    regex=True,
)

In [9]:
# Tokenization: split each cleaned text into a list of word tokens.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# installed already; this notebook does not download it — confirm on a fresh
# environment.
df['token'] = df['text'].map(word_tokenize)

In [10]:
# Remove stopwords.
# The English stop-word list is turned into a set once so that each
# membership test below is a cheap hash lookup.
stop_words = set(stopwords.words('english'))
# Keep only the tokens that are not stop words, preserving their order.
# NOTE(review): punctuation was stripped before tokenizing, so contractions
# such as "couldnt" are not filtered here and remain in the token lists
# (visible in the preview output) — confirm this is intended.
df['token'] = df['token'].map(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

In [11]:
# Preview the cleaned text next to its stop-word-filtered token list.
df.head()

Unnamed: 0,text,sentiment,token
0,id have responded if i were going,neutral,"[id, responded, going]"
1,sooo sad i will miss you here in san diego,negative,"[sooo, sad, miss, san, diego]"
2,my boss is bullying me,negative,"[boss, bullying]"
3,what interview leave me alone,negative,"[interview, leave, alone]"
4,sons of why couldnt they put them on the rel...,negative,"[sons, couldnt, put, releases, already, bought]"
