# Identify the Sentiments - AV Hackathon

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import nltk

In [3]:
from nltk.corpus import stopwords

In [4]:
# Fetch NLTK's stop-word corpus; stopwords.words("english") in the preprocessing cells reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Fetch the WordNet corpus, which NLTK's WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
from nltk.stem import WordNetLemmatizer

In [7]:
# One lemmatizer instance, reused by both the train and test preprocessing loops.
lemmatizer = WordNetLemmatizer()

### Import Dataset

In [8]:
# Load the labelled training tweets and preview the first rows.
train_csv_path = "/content/drive/MyDrive/Machine Learning Projects/Identify the Sentiments/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [9]:
# (rows, columns) of the training frame.
df.shape

(7920, 3)

### Data Preprocessing

In [10]:
# Drop id column
# errors="ignore" keeps this cell re-runnable: once "id" has been removed,
# a plain drop would raise KeyError on a second execution.
df = df.drop(columns = ["id"], errors = "ignore")

In [11]:
# Confirm that only the label and tweet columns remain.
df.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


In [12]:
# Clean every training tweet: remove link tokens, lower-case, keep letters only,
# drop English stop words, then lemmatize (default noun pass followed by a verb pass).
# The stop-word list is loaded once into a set. Re-reading it for every word, as
# this cell previously did, dominated the runtime without changing the result.
english_stopwords = set(stopwords.words("english"))

preprocessed_tweets = []
for tweet in df["tweet"]:
  tokens = [word for word in tweet.split() if not word.startswith("http")] # Step - 1 : Remove Websites
  text = " ".join(tokens).lower() # Step - 2 : Lower the case
  text = re.sub("[^A-Za-z]", " ", text) # Step - 3 : Remove the special characters
  tokens = [word for word in text.split() if word not in english_stopwords] # Step - 4 : Remove Stopwords
  tokens = [lemmatizer.lemmatize(word) for word in tokens] # Step - 5: Lemmatization
  tokens = [lemmatizer.lemmatize(word, pos = "v") for word in tokens]
  preprocessed_tweets.append(" ".join(tokens))

In [13]:
# Spot-check one cleaned tweet (index 124).
preprocessed_tweets[124]

'unpack sony xperia z compact android really really nice first impression pic twitter com blo b osu'

In [14]:
# Spot-check another cleaned tweet (index 56).
# NOTE(review): the output for this tweet still contains URL fragments
# ("http www boston com ..."). The Step-1 filter only drops tokens that *start*
# with "http", so a link embedded mid-token presumably slips through - TODO confirm
# against the raw tweet.
preprocessed_tweets[56]

'black friday call black friday http www boston com news business black friday call black friday blackfridaydeals blackfriday blackfriday news amazon shop shop christmas holiday gift family friend newyork business crowd money birthday tv iphone computer lifehack school'

In [15]:
# Swap the raw tweet text for its cleaned version, then preview the result.
df = df.assign(tweet = preprocessed_tweets)
df.head()

Unnamed: 0,label,tweet
0,0,fingerprint pregnancy test android apps beauti...
1,0,finally transparant silicon case thank uncle y...
2,0,love would go talk makememories unplug relax i...
3,0,wire know george make way iphone cute daventry...
4,1,amaze service apple even talk question unless ...


In [16]:
# Separate the dependent variable (label) from the independent variables.
y_train = df["label"]
# Everything except the label. This name is rebound to the TF-IDF matrix
# further down the notebook, so this frame is only an intermediate.
X_train = df.drop("label", axis = 1)

### Bag of Words Approach (TF-IDF weighted)

In [17]:
# Converting words to vectors
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# tweets in a single pass; a separate fit() followed by transform() tokenizes
# the corpus twice and yields the same matrix.
X_train = tfidf.fit_transform(preprocessed_tweets)

In [18]:
# Inspect the vectorized training matrix (one row per tweet, one column per vocabulary term).
X_train

<7920x15877 sparse matrix of type '<class 'numpy.float64'>'
	with 88523 stored elements in Compressed Sparse Row format>

In [19]:
## Prepare test data
# Load the unlabelled test tweets and preview the first rows.
test_csv_path = "/content/drive/MyDrive/Machine Learning Projects/Identify the Sentiments/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [20]:
# Remove the id column so only the tweet text remains.
# errors="ignore" keeps this cell re-runnable: once "id" has been removed,
# a plain drop would raise KeyError on a second execution.
test_df = test_df.drop(columns = ["id"], errors = "ignore")
test_df.head()

Unnamed: 0,tweet
0,I hate the new #iphone upgrade. Won't let me d...
1,currently shitting my fucking pants. #apple #i...
2,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,My ipod is officially dead. I lost all my pict...
4,Been fighting iTunes all night! I only want th...


In [21]:
# Clean every test tweet with the same five steps applied to the training tweets,
# so the fitted vectorizer sees text in the same form.
# The stop-word list is loaded once into a set. Re-reading it for every word, as
# this cell previously did, dominated the runtime without changing the result.
english_stopwords = set(stopwords.words("english"))

preprocessed_test_tweets = []
for tweet in test_df["tweet"]:
  tokens = [word for word in tweet.split() if not word.startswith("http")] # Step - 1 : Remove Websites
  text = " ".join(tokens).lower() # Step - 2 : Lower the case
  text = re.sub("[^A-Za-z]", " ", text) # Step - 3 : Remove the special characters
  tokens = [word for word in text.split() if word not in english_stopwords] # Step - 4 : Remove Stopwords
  tokens = [lemmatizer.lemmatize(word) for word in tokens] # Step - 5: Lemmatization
  tokens = [lemmatizer.lemmatize(word, pos = "v") for word in tokens]
  preprocessed_test_tweets.append(" ".join(tokens))

In [24]:
# Vectorize the test tweets with the vectorizer already fitted on the training
# tweets (transform only, no refit), so X_test has the same columns as X_train.
X_test = tfidf.transform(preprocessed_test_tweets)

In [25]:
# Inspect the vectorized test matrix; its column count should match X_train.
X_test

<1953x15877 sparse matrix of type '<class 'numpy.float64'>'
	with 18796 stored elements in Compressed Sparse Row format>

## Model Building

In [29]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the TF-IDF features
# (fit returns the estimator itself, so construction and training chain).
MNB_model = MultinomialNB().fit(X_train, y_train)
# Predict a label for every test tweet.
predictions = MNB_model.predict(X_test)

In [30]:
# Load the sample submission file, which supplies the id/label layout to fill in.
sample_csv_path = "/content/drive/MyDrive/Machine Learning Projects/Identify the Sentiments/sample.csv"
submissions = pd.read_csv(sample_csv_path)
submissions.head()

Unnamed: 0,id,label
0,7921,0
1,7922,0
2,7923,0
3,7924,0
4,7925,0


In [31]:
# Replace the label column loaded from sample.csv with the model's predictions,
# then save the result without the pandas index.
submission_path = "tfidf_Sub.csv"
submissions["label"] = predictions
submissions.to_csv(submission_path, index = False)

#### Competition Score : 0.819472988415498