In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import re


In [3]:
#load the dataset
df = pd.read_csv('../data/raw/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

In [5]:
#Display the first few rows
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
#check the shape of the dataset
print(f'Dataset ontains {df.shape[0]} rows and {df.shape[1]} columns.')

Dataset ontains 1600000 rows and 6 columns.


In [7]:
#Check for missing values
print(df.isnull().sum())

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64


In [8]:
#Get basic statistics
df.describe()

Unnamed: 0,sentiment,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [9]:
#Check the distribution of sentiment
df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [10]:
#Remove dups
df.drop_duplicates(inplace=True)
print(f'Dataset contains {df.shape[0]} rows after removing duplicates.')

Dataset contains 1600000 rows after removing duplicates.


In [11]:
#Check for missing values in the text column
print(df['text'].isnull().sum())

0


In [12]:
#Normalize text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [13]:
# Apply normalization
df['text'] = df['text'].apply(normalize_text)

In [14]:
# Display the first few rows after normalization
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,awww thats a bummer you shoulda got david ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many times for the ball managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no its not behaving at all im mad why am i he...
