In [3]:
import pandas as pd
import seaborn as sns
import plotly.express as px

from text_preprocessing import preprocess_text

In [60]:
# Read the Twitter dataset from disk, then print a schema summary
# (column names, non-null counts, dtypes) to spot missing values early.
df = pd.read_csv(filepath_or_buffer='data/Twitter_Data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [61]:
# Disable column-width truncation so whole tweets are visible,
# then preview the first rows of the frame.
pd.options.display.max_colwidth = None
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples,-1.0
1,talk all the nonsense and continue all the drama will vote for modi,0.0
2,what did just say vote for modi welcome bjp told you rahul the main campaigner for modi think modi should just relax,1.0
3,asking his supporters prefix chowkidar their names modi did great service now there confusion what read what not now crustal clear what will crass filthy nonsensical see how most abuses are coming from chowkidars,1.0
4,answer who among these the most powerful world leader today trump putin modi may,1.0


In [62]:
# Remove exact duplicate rows, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)

# Sanity check: the number of rows still flagged as duplicates should be 0
df.duplicated(keep='first').sum()

0

In [63]:
# Discard every row that has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

# Sanity check: per-column count of remaining missing values (expected: all 0)
df.isna().sum()

clean_text    0
category      0
dtype: int64

In [64]:
# Give the columns task-oriented names: clean_text -> tweet, category -> sentiment
df.rename(columns={'clean_text': 'tweet', 'category': 'sentiment'}, inplace=True)

# Confirm the new column names and the row count after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tweet      162969 non-null  object 
 1   sentiment  162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [65]:
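# Apply text preprocessing (currently disabled: the call below is commented out,
# so tweets in `df` are NOT passed through preprocess_text, unlike `tdf` later on)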
# df['tweet'] = df['tweet'].astype(str).apply(preprocess_text)

In [66]:
# Remove consecutive duplicated words, e.g. "very very very good" -> "very good".
# The pattern captures a word and any immediate whitespace-separated repeats
# of it, and replaces the whole run with a single copy (\1).
# regex=True must be passed explicitly: the default for Series.str.replace
# changed from True to False, and with regex=False the pattern is treated as
# a literal string, so this step would silently do nothing.
df['tweet'] = df['tweet'].str.replace(r'\b(\w+)(\s+\1)+\b', r'\1', regex=True)


The default value of regex will change from True to False in a future version.



In [67]:
# Drop short sentences: count whitespace-separated tokens per tweet and
# keep only tweets that contain at least two of them.
df['length'] = df['tweet'].apply(lambda text: len(text.split()))
df = df.loc[df['length'] >= 2]

In [68]:
# CUDA expects consecutive labels in the range 0 to C-1,
# where C is the number of classes, so shift every label up by one
# (-1, 0, 1 -> 0, 1, 2).
# NOTE: this cell is not idempotent; re-running it shifts the labels again.
df['sentiment'] = df['sentiment'] + 1.

# Show the distinct label values after the shift
df['sentiment'].unique()

array([0., 1., 2.])

In [73]:
# Count tweets per sentiment class and plot one bar per class.
# Labels and counts are taken from the SAME sorted table so each bar is paired
# with its own class: unique() returns labels in order of first appearance,
# while value_counts() returns counts in descending frequency, so plotting
# one against the other can attach counts to the wrong labels.
sentiment_counts = (
    df['sentiment']
    .value_counts()
    .sort_index()
    .rename_axis('sentiment')
    .reset_index(name='count')
)

fig = px.bar(sentiment_counts,
             x='sentiment',
             y='count',
             template='plotly_dark',
             color='sentiment',
             text_auto=True,
             height=400,
             width=400)
fig.show()

In [46]:
# Target class size: the number of rows labelled 1.0. Later cells reuse
# `max_length` when topping up the class labelled 0.
max_length = len(df[df.sentiment == 1.])

# Select rows randomly, where the value of the 'sentiment' column is 2.0 (Positive).
# Only the surplus over `max_length` is selected. The count is clamped at 0 so
# the cell does not raise when the class is already small enough, and
# random_state pins the selection so a re-run drops the same rows.
positive_rows = df[df.sentiment == 2.]
n_surplus = max(len(positive_rows) - max_length, 0)
to_drop = positive_rows.sample(n_surplus, random_state=42).index

# Drop those rows
df.drop(to_drop, inplace=True)

In [16]:
# Class balance check: number of tweets per sentiment label at this point.
df['sentiment'].value_counts()

1.0    54453
2.0    54453
0.0    35485
Name: sentiment, dtype: int64

In [47]:
# Load the second tweet dataset. Column names are supplied through `names=`,
# so pandas treats the first line of the file as data rather than as a header.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]

tdf = pd.read_csv(
    "data/sent140.csv",
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

In [18]:
# Inspect the schema (columns, non-null counts, dtypes) of the loaded frame.
tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   ids        1600000 non-null  int64 
 2   date       1600000 non-null  object
 3   flag       1600000 non-null  object
 4   user       1600000 non-null  object
 5   text       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [48]:
# Drop irrelevant columns: only the label and the tweet text are kept
cols_to_drop = ["ids", "date", "flag", "user"]
tdf.drop(columns=cols_to_drop, inplace=True)

# Rename the columns so that they match the original df
# ("sentiment" already has the right name; "text" becomes "tweet")
tdf.rename(columns={"text": "tweet"}, inplace=True)

In [20]:
# Re-check the schema: only the `sentiment` and `tweet` columns should remain.
tdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   tweet      1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [49]:
# Remove exact duplicate rows, keeping the first occurrence of each
tdf.drop_duplicates(keep='first', inplace=True)

# Sanity check: the number of rows still flagged as duplicates should be 0
tdf.duplicated(keep='first').sum()

0

In [50]:
# Discard every row that has a missing value in any column
tdf.dropna(axis=0, how='any', inplace=True)

# Sanity check: per-column count of remaining missing values (expected: all 0)
tdf.isna().sum()

sentiment    0
tweet        0
dtype: int64

In [51]:
# Keep a random subset of 50,000 rows whose sentiment label is 0.
# random_state pins the selection so a re-run yields the same subset.
tdf = tdf[tdf.sentiment == 0].sample(50000, random_state=42)

In [52]:
# Convert tweets to str and preprocess them
tdf['tweet'] = tdf['tweet'].apply(str).apply(preprocess_text)

# Drop short sentences: count whitespace-separated tokens per tweet and
# keep only tweets that contain at least two of them.
tdf['length'] = tdf['tweet'].apply(lambda text: len(text.split()))
tdf = tdf.loc[tdf['length'] >= 2]

In [53]:
# Select rows randomly, where the value of the 'sentiment' column is 0 (Negative).
# Only as many rows are taken as the 0.0 class in `df` is short of `max_length`.
# The count is clamped at 0 so the cell does not raise when nothing is missing,
# and random_state pins the selection so a re-run picks the same rows.
n_missing = max(max_length - len(df[df.sentiment == 0.]), 0)
to_add = tdf[tdf.sentiment == 0].sample(n_missing, random_state=42)

In [54]:
# Concatenate the two datasets.
# ignore_index=True rebuilds a fresh 0..n-1 index: the two frames carry row
# labels from two unrelated CSV files, so keeping them could leave duplicate
# index labels and make later label-based operations (drop / .loc) ambiguous.
df = pd.concat([df, to_add], axis=0, ignore_index=True)

In [58]:
# Count tweets per sentiment class in the combined data and plot one bar per class.
# Labels and counts are taken from the SAME sorted table so each bar is paired
# with its own class: unique() returns labels in order of first appearance,
# while value_counts() returns counts in descending frequency, so plotting
# one against the other can attach counts to the wrong labels.
sentiment_counts = (
    df['sentiment']
    .value_counts()
    .sort_index()
    .rename_axis('sentiment')
    .reset_index(name='count')
)

fig = px.bar(sentiment_counts,
             x='sentiment',
             y='count',
             template='plotly_dark',
             color='sentiment',
             text_auto=True,
             height=400,
             width=400)
fig.show()

In [28]:
# Export the data as JSON Lines (one JSON record per row), not CSV
df.to_json("inputs/train.json", orient='records', lines=True)