<a href="https://colab.research.google.com/github/nikhil697/Cleaning-Data-Using-NLP/blob/main/102103482_nlp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Sentiment Analysis for Tweets- Pre Processing using NLP**

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP2/tweet_emotions.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


**Checking for null Values**

In [5]:
print(data.isnull().sum())

tweet_id     0
sentiment    0
content      0
dtype: int64


**Removing the unwanted symbols and special characters**

In [6]:
import re
data['content'] = data['content'].apply(lambda x: re.sub(r'@\w+', '', x))

data['content'] = data['content'].apply(lambda x: re.sub(r'http\S+', '', x))

data['content'] = data['content'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

data['content'] = data['content'].str.lower()

In [7]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,layin n bed with a headache ughhhhwaitin on y...
2,1956967696,sadness,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends soon
4,1956968416,neutral,we want to trade with someone who has houston...


In [8]:
data.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

**Removing Duplicates**

In [9]:
print(data.duplicated().sum())

0


In [20]:
# If uplicates were there

#data_no_duplicates_content = data.drop_duplicates(subset=['content'])

In [10]:
data['content'] = data['content'].str.replace('/n', '')

**Tokenization and Lemmatization**

In [11]:
import nltk  #Natural Language Toolkit
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

data['tokens'] = data['content'].apply(lambda x: word_tokenize(x))

lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

stop_words = set(stopwords.words('english'))
data['content'] = data['content'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
data.head()

Unnamed: 0,tweet_id,sentiment,content,tokens,lemmatized
0,1956967341,empty,know listenin bad habit earlier started freaki...,"[i, know, i, was, listenin, to, bad, habit, ea...","[i, know, i, wa, listenin, to, bad, habit, ear..."
1,1956967666,sadness,layin n bed headache ughhhhwaitin call,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, with, a, headache, ughhhhwaiti..."
2,1956967696,sadness,funeral ceremonygloomy friday,"[funeral, ceremonygloomy, friday]","[funeral, ceremonygloomy, friday]"
3,1956967789,enthusiasm,wants hang friends soon,"[wants, to, hang, out, with, friends, soon]","[want, to, hang, out, with, friend, soon]"
4,1956968416,neutral,want trade someone houston tickets one,"[we, want, to, trade, with, someone, who, has,...","[we, want, to, trade, with, someone, who, ha, ..."


In [13]:
data.to_pickle("cleaned_data.pkl")
cleaned_data=data

## **Lets Organise The Data**

**Corpus**

In [14]:
corpus = data['content'].tolist()

In [15]:
data.to_pickle("corpus.pkl")
data

Unnamed: 0,tweet_id,sentiment,content,tokens,lemmatized
0,1956967341,empty,know listenin bad habit earlier started freaki...,"[i, know, i, was, listenin, to, bad, habit, ea...","[i, know, i, wa, listenin, to, bad, habit, ear..."
1,1956967666,sadness,layin n bed headache ughhhhwaitin call,"[layin, n, bed, with, a, headache, ughhhhwaiti...","[layin, n, bed, with, a, headache, ughhhhwaiti..."
2,1956967696,sadness,funeral ceremonygloomy friday,"[funeral, ceremonygloomy, friday]","[funeral, ceremonygloomy, friday]"
3,1956967789,enthusiasm,wants hang friends soon,"[wants, to, hang, out, with, friends, soon]","[want, to, hang, out, with, friend, soon]"
4,1956968416,neutral,want trade someone houston tickets one,"[we, want, to, trade, with, someone, who, has,...","[we, want, to, trade, with, someone, who, ha, ..."
...,...,...,...,...,...
39995,1753918954,neutral,,[],[]
39996,1753919001,love,happy mothers day love,"[happy, mothers, day, all, my, love]","[happy, mother, day, all, my, love]"
39997,1753919005,love,happy mothers day mommies woman man long youre...,"[happy, mothers, day, to, all, the, mommies, o...","[happy, mother, day, to, all, the, mommy, out,..."
39998,1753919043,happiness,wassup beautiful follow peep new hit singles w...,"[wassup, beautiful, follow, me, peep, out, my,...","[wassup, beautiful, follow, me, peep, out, my,..."


**Document Term Matrix**

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

cleaned_data = pd.read_pickle("cleaned_data.pkl")

cv = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.8)

data_cv = cv.fit_transform(cleaned_data['content'])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = cleaned_data.index
print(data_dtm)
data_dtm.to_pickle("document_term_matrix.pkl")


       aa  aaa  aaaa  aaaaaaaaaamazing  aaaaaah  aaaaahhhh  aaaah  aaaand  \
0       0    0     0                 0        0          0      0       0   
1       0    0     0                 0        0          0      0       0   
2       0    0     0                 0        0          0      0       0   
3       0    0     0                 0        0          0      0       0   
4       0    0     0                 0        0          0      0       0   
...    ..  ...   ...               ...      ...        ...    ...     ...   
39995   0    0     0                 0        0          0      0       0   
39996   0    0     0                 0        0          0      0       0   
39997   0    0     0                 0        0          0      0       0   
39998   0    0     0                 0        0          0      0       0   
39999   0    0     0                 0        0          0      0       0   

       aaaand im  aaah  ...  zoo  zoo fun  zs  zs high  zu  zune  zune hd  

# **Additional Assignment**

Play around with CountVectorizer's parameters. What is ngram_range? What is min_df and max_df?

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))
dtm_ngram = vectorizer_ngram.fit_transform(corpus)

vectorizer_df = CountVectorizer(min_df=2, max_df=0.8)
dtm_df_new = vectorizer_df.fit_transform(corpus)

In [18]:
dtm_df_new

<40000x12351 sparse matrix of type '<class 'numpy.int64'>'
	with 255510 stored elements in Compressed Sparse Row format>