In [4]:
import pandas as pd
# DATASET
! unzip ./dataset/*.zip
! mv *.csv data.csv

DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

path="./dataset/data.csv"
df= pd.read_csv(path,encoding =DATASET_ENCODING , names=DATASET_COLUMNS)

Analyze the dataset

In [6]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [8]:
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))

CPU times: user 345 ms, sys: 0 ns, total: 345 ms
Wall time: 348 ms


In [12]:
from collections import Counter
target_cnt = Counter(df.target)
print(target_cnt)

Counter({'NEGATIVE': 800000, 'POSITIVE': 800000})


Preprocess the dataset
1. drop unnecessery cols

In [13]:
df = df.drop(['ids', 'date', 'flag', 'user'], axis=1)
df.head()

Unnamed: 0,target,text
0,NEGATIVE,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,NEGATIVE,is upset that he can't update his Facebook by ...
2,NEGATIVE,@Kenichan I dived many times for the ball. Man...
3,NEGATIVE,my whole body feels itchy and like its on fire
4,NEGATIVE,"@nationwideclass no, it's not behaving at all...."


2. clean the texts

In [15]:
%%time

import re
# Function to clean the text
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()
df['text'] = df['text'].apply(clean_text)

CPU times: user 13.8 s, sys: 144 ms, total: 13.9 s
Wall time: 14.3 s


In [16]:
df.head()

Unnamed: 0,target,text
0,NEGATIVE,http twitpic com y zl awww that s a bummer you...
1,NEGATIVE,is upset that he can t update his facebook by ...
2,NEGATIVE,i dived many times for the ball managed to sav...
3,NEGATIVE,my whole body feels itchy and like its on fire
4,NEGATIVE,no it s not behaving at all i m mad why am i h...


In [17]:
# nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from  nltk.stem import SnowballStemmer


stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
%%time

def preprocess(text, stem=False):
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

df.text = df.text.apply(lambda x: preprocess(x))

CPU times: user 34.4 s, sys: 95.1 ms, total: 34.5 s
Wall time: 35.3 s


In [20]:
df.head()

Unnamed: 0,target,text
0,NEGATIVE,http twitpic com zl awww bummer shoulda got da...
1,NEGATIVE,upset update facebook texting might cry result...
2,NEGATIVE,dived many times ball managed save rest go bounds
3,NEGATIVE,whole body feels itchy like fire
4,NEGATIVE,behaving mad see
