# Sentiment140 Dataset Cleaning and Feature Engineering

In [None]:
import json
import os

# Kaggle API credentials. The KAGGLE_USERNAME / KAGGLE_KEY environment
# variables are preferred so real secrets never get saved inside the notebook;
# the placeholders are only a fallback and must be replaced with your actual
# username and key from Kaggle settings before running.
info = {
    "username": os.environ.get("KAGGLE_USERNAME", "kaggleuser"),
    "key": os.environ.get("KAGGLE_KEY", "kagglekey"),
}

# Create the .kaggle directory and save the file
os.makedirs("/root/.kaggle", exist_ok=True)
with open("/root/.kaggle/kaggle.json", "w") as f:
    json.dump(info, f)

# Restrict the file to owner read/write. The mode has to be an octal literal:
# decimal 600 is 0o1130, which sets unrelated bits and gives the owner
# neither read nor write permission.
os.chmod("/root/.kaggle/kaggle.json", 0o600)

## Kaggle dataset download

In [None]:
!pip install kaggle

# Downloading and unzipping the Sentiment140 dataset
!kaggle datasets download -d kazanova/sentiment140
!unzip sentiment140.zip

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 93% 75.0M/80.9M [00:00<00:00, 783MB/s]
100% 80.9M/80.9M [00:00<00:00, 784MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


## Filtering out noise

In [None]:
import re

def clean_tweet(text):
  """Normalize a raw tweet string.

  The text is lowercased, then any run of non-whitespace characters that
  begins with ``http`` (links) is removed, followed by any ``@`` that is
  immediately followed by non-whitespace characters (user mentions).
  Leading and trailing whitespace is trimmed from the result.
  """
  cleaned = text.lower()
  # Patterns are applied one after another, links first and mentions second.
  for noise_pattern in (r'http\S+', r'@\S+'):
    cleaned = re.sub(noise_pattern, '', cleaned)

  return cleaned.strip()

In [None]:
# Quick check on one hand-written tweet: the mention and the link should be
# removed and the remaining text lowercased.
raw_tweet = "@User123 I LOVE this movie! http://coolmovies.com"
print(clean_tweet(raw_tweet))

i love this movie!


## Keeping only text and target for **training**

In [None]:
import pandas as pd

# Load only the two columns this analysis needs. The file has no header row,
# so all six column names are supplied; `usecols` then lets pandas skip the
# four unused columns instead of parsing them and discarding them afterwards.
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1',
                 names=['target', 'id', 'date', 'flag', 'user', 'text'],
                 usecols=['target', 'text'])

# usecols keeps the file's column order (target, text); put text first.
df = df[['text', 'target']]

In [None]:
# Preview the first rows to confirm only `text` and `target` remain.
df.head()

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [None]:
# Number of rows in the loaded frame.
len(df)

1600000

## Performing Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at the 1000 most frequent words to avoid running out of memory
vectorizer = CountVectorizer(max_features=1000)

# Trial run on a random sample of 10,000 tweets (rows, not words).
# A fixed random_state makes the sample, and therefore X, reproducible.
X = vectorizer.fit_transform(df['text'].sample(10000, random_state=42))

In [None]:
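## Scratchpad that gathers the previous steps in one cell and raises the
## sample batch to 50000.
## NOTE: this cell still has to be run -- later cells use `clean_text` and
## `df_sample`, which are defined here.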
import pandas as pd
import re

# Reload the raw dump with all six columns. Column names are supplied
# explicitly, which is what makes pandas treat the first line as data;
# header=None states that outright.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)

def clean_text(text):
    """Return ``text`` lowercased with links and @mentions stripped.

    Links are any run of non-whitespace characters starting with ``http``;
    mentions are an ``@`` immediately followed by non-whitespace characters.
    Links are removed first, then mentions, and the result is trimmed of
    leading and trailing whitespace.
    """
    without_links = re.sub(r'http\S+', '', text.lower())
    without_mentions = re.sub(r'@\S+', '', without_links)
    return without_mentions.strip()

# Increase the sample size to 50k.
# random_state pins the draw so re-running the notebook selects the same rows,
# which keeps every result derived from df_sample reproducible.
df_sample = df.sample(50000, random_state=42)
df_sample['clean_text'] = df_sample['text'].apply(clean_text)

In [None]:
# Compare the raw `text` column with the new `clean_text` column on a few rows.
df_sample.head()

Unnamed: 0,target,id,date,flag,user,text,clean_text
577266,0,2212433264,Wed Jun 17 14:05:20 PDT 2009,NO_QUERY,Miss_Amarantha,"shit my bro comes home, gotta leave his pc now...","shit my bro comes home, gotta leave his pc now..."
177569,0,1965695245,Fri May 29 16:22:31 PDT 2009,NO_QUERY,slvrleo21,homework on a friday night...lame,homework on a friday night...lame
1278732,4,2001457344,Tue Jun 02 00:57:03 PDT 2009,NO_QUERY,LisaRez,@DaveJMatthews you killed it on Fuse! congrats...,you killed it on fuse! congrats - can't wait t...
990008,4,1834941116,Mon May 18 04:40:50 PDT 2009,NO_QUERY,katehughes,@jiminthemorning i am actually one too. totall...,i am actually one too. totally agree. we're yummy
31400,0,1564007325,Mon Apr 20 01:18:06 PDT 2009,NO_QUERY,MattJarryAstley,"What to do, what to do? I should really do som...","what to do, what to do? i should really do som..."


In [None]:
# Confirm the sample holds the requested number of rows.
len(df_sample)

50000

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words over the cleaned 50k sample; this rebinds `vectorizer` and `X`
# from the earlier trial cell.
vectorizer = CountVectorizer(max_features=1500) # Only keep the top 1500 words
# NOTE(review): this runs before the stop-word filtering cell further down,
# so X is built from text that still contains stop words.
X = vectorizer.fit_transform(df_sample['clean_text'])

## Filtering Stop Words

In [None]:
import nltk
from nltk.corpus import stopwords

# Downloading the list from nltk library
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Keeping 'not' and 'no' for sentiment!
words_to_keep = {'not', 'no', 'never', 'neither', 'nor'}
# NLTK's English list also contains negated contractions ("don't", "isn't",
# "won't", ...). Removing those would throw away the same negation signal the
# words above are kept for, so every "...n't" form is retained as well.
stop_words = {
    word for word in stop_words
    if word not in words_to_keep and not word.endswith("n't")
}

def remove_stopwords(text):
    """Drop stop words from a whitespace-separated string.

    The text is split on whitespace, tokens found in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are joined
    back with single spaces. Matching is exact, so a token with attached
    punctuation (e.g. ``"the,"``) is not treated as a stop word.
    """
    return " ".join(word for word in text.split() if word not in stop_words)

# Now applying it to our cleaned text as before!
# NOTE: X was vectorized from clean_text before this filtering, so it does not
# reflect the removal; re-run the vectorizer cell afterwards if it should.
df_sample['clean_text'] = df_sample['clean_text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Feature Scaling

In [None]:
from sklearn.preprocessing import Normalizer

# L2-normalize each row (one tweet per row) of the bag-of-words matrix so that
# every non-empty row has a sum of squares equal to 1. Rows that are entirely
# zero stay zero.
scaler = Normalizer(norm='l2')

# fit followed by transform on the same matrix is what fit_transform does.
scaler.fit(X)
X_scaled = scaler.transform(X)

## Verification

In [None]:
import numpy as np

# Picking the first row of your scaled matrix
# (.toarray() turns the sparse 1 x n_features row into a dense array)
row = X_scaled[0].toarray()

# Calculating the norm (it should be 1.0 or very close to it)
# NOTE(review): this inspects a single row only; a row with no counted words
# would print 0.0 here instead.
print(np.linalg.norm(row))

1.0


## Batch Processing on Entire Dataset

In [None]:
import pandas as pd

# Read the CSV lazily, 100,000 rows per batch, rather than as one big frame.
chunk_size = 100000
chunks = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
    chunksize=chunk_size,
)

cleaned_data = []

for chunk in chunks:
    # Clean the tweet text of this batch, then keep just the two columns
    # that are used from here on.
    chunk['text'] = chunk['text'].apply(clean_text)
    cleaned_data.append(chunk.loc[:, ['text', 'target']])

# Stitch the cleaned batches back together into a single frame.
df_final = pd.concat(cleaned_data)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary pruning for the full corpus:
#   min_df=10         -> ignore terms that appear in fewer than 10 tweets
#   max_df=0.95       -> ignore terms that appear in more than 95% of tweets
#   max_features=5000 -> of what remains, keep the 5000 most frequent terms
vectorizer = CountVectorizer(min_df=10, max_df=0.95, max_features=5000)
# Rebinds X to the bag-of-words matrix of the whole cleaned dataset.
X = vectorizer.fit_transform(df_final['text'])

In [None]:
from sklearn.preprocessing import Normalizer

# Row-wise L2 normalization of the full bag-of-words matrix. The norm is
# spelled out (it is Normalizer's default) to match the sample-sized run.
scaler = Normalizer(norm='l2')
X_scaled = scaler.fit_transform(X)

## Verifying Feature Scaling on the Entire Dataset

In [None]:
import numpy as np
from scipy.sparse import linalg

# Euclidean norm of every row, computed directly on the sparse matrix
# (only the stored non-zero entries contribute to the sum of squares).
row_norms = np.sqrt(np.array(X_scaled.power(2).sum(axis=1)))

# Checking scaled feature against benchmark of 1.0 (ideal)
is_scaled = np.allclose(row_norms, 1.0)

# A tweet with no in-vocabulary term is an all-zero row and keeps norm 0 after
# normalization; such rows explain an average norm below 1.0.
zero_rows = int(np.count_nonzero(np.isclose(row_norms, 0.0)))

print(f"Average norm: {np.mean(row_norms)}")
# Report the outcome of the check instead of computing it and discarding it.
print(f"All rows have unit norm: {is_scaled}")
print(f"All-zero rows (norm 0): {zero_rows} of {row_norms.shape[0]}")

Average norm: 0.994069375


## Download the Cleaned Sample Dataset


In [None]:
# Save the cleaned sample to a new CSV file
# (this writes df_sample, i.e. the sampled rows with their text columns; the
# scaled matrix X_scaled is not part of this file). index=False leaves the
# DataFrame index out of the CSV.
df_sample.to_csv('cleaned_sentiment_data.csv', index=False)

# In Colab, you can download it to your computer or save to Google Drive
# NOTE(review): google.colab is expected to be available only inside a Colab
# runtime -- confirm before running this cell elsewhere.
from google.colab import files
files.download('cleaned_sentiment_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>