In [125]:
import pandas as pd
# Read the review/rating CSV, report its dimensions, then preview the first rows.
REVIEWS_CSV = 'l&t_review_and_rating_ambitionbox.csv'
data = pd.read_csv(REVIEWS_CSV)
print(data.shape)
data.head()

(9990, 2)


Unnamed: 0,review,rating
0,I would say people must work in L&T once to kn...,2.0
1,1. Overall a good company to work for! 2. Time...,4.0
2,I have worked with L&T and found that the top ...,5.0
3,"Good Learning, Freedom of Work, Job Security, ...",4.0
4,Company culture is good but only for payroll e...,3.0


- Create a `sentiment` column from `rating`: ratings above 3 are labelled positive, ratings of 3 or below are labelled negative.

In [126]:
def rating_2_sentiment(rating, threshold=3):
    """Map a numeric rating to a binary sentiment label.

    Parameters
    ----------
    rating : float
        Rating attached to a review.
    threshold : float, default 3
        Ratings strictly greater than this value count as positive.

    Returns
    -------
    str
        'positive' if ``rating > threshold``, otherwise 'negative'.

    Notes
    -----
    A missing rating (NaN) compares False against any threshold and is
    therefore labelled 'negative'. Drop or impute missing ratings before
    calling this if that is not the intended treatment.
    """
    return 'positive' if rating > threshold else 'negative'

# Label every review by passing its rating through rating_2_sentiment.
data["sentiment"] = data["rating"].apply(rating_2_sentiment)

In [127]:
# Class balance of the derived labels.
data["sentiment"].value_counts()

sentiment
positive    7391
negative    2599
Name: count, dtype: int64

- Remove HTML tags, URLs, non-alphanumeric characters and stopwords (common words such as ‘and’, ‘the’, ‘at’ that carry little meaning on their own) from the reviews, and convert the text to lowercase.

In [128]:
import re
import nltk

In [129]:
# Fetch NLTK's stopword corpus; the call reports "already up-to-date" when it is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ashutosh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [130]:
def remove_tags(string):
    """Strip markup, links and punctuation from a review and lowercase it.

    Parameters
    ----------
    string : str
        Raw review text.

    Returns
    -------
    str
        Lowercased text in which HTML tags, http(s) URLs and every character
        outside ``[a-zA-Z0-9]`` have been replaced by a space. Runs of spaces
        are not collapsed; downstream code splits on whitespace.
    """
    # Replace HTML tags such as <br/> or <a href="..."> with a space so the
    # words on either side do not fuse. Requiring a letter right after '<'
    # avoids swallowing text in comparisons like "salary < 5".
    result = re.sub(r'</?[a-zA-Z][^>]*>', ' ', string)
    # Remove only the URL itself (up to the next whitespace) rather than the
    # remainder of the line, and accept both http and https.
    result = re.sub(r'https?://\S+', ' ', result)
    # Anything that is not a letter or a digit becomes a space.
    result = re.sub(r'[^a-zA-Z0-9]', ' ', result)
    return result.lower()
# Missing reviews are NaN; str(NaN) would inject the literal token "nan" into
# the corpus, so blank them out before cleaning.
data['review'] = data['review'].fillna('').astype(str).apply(remove_tags)


In [131]:
from nltk.corpus import stopwords
# A set gives constant-time membership tests during filtering.
stop_words = set(stopwords.words('english'))
# The reviews were lowercased during cleaning, so an uppercase "I" alone can
# never match a token; register the lowercase form as well.
stop_words.update(("I", "i"))
# NOTE(review): NLTK's English list also contains negations such as "not" and
# "no"; consider keeping them, since they can flip the sentiment of a review.
data['review'] = data['review'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

- Split each review into individual words with `WhitespaceTokenizer()`, then lemmatize every word with the `lemmatize_text` function.

In [132]:
# Fetch the WordNet corpus that nltk.stem.WordNetLemmatizer relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ashutosh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [133]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Cleaned review text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces, with no leading or
        trailing whitespace. An empty or whitespace-only input yields ''.

    Notes
    -----
    ``lemmatize`` is called without a part-of-speech tag, so tokens are
    treated as nouns; verb forms such as "worked" are left unchanged.
    """
    # join() avoids rebuilding the string on every token and does not leave
    # a dangling space at the end.
    return " ".join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))


data['review'] = data['review'].apply(lemmatize_text)

In [134]:
# Preview the first ten rows after cleaning, stopword removal and lemmatization.
data.head(10)

Unnamed: 0,review,rating,sentiment
0,would say people must work l know could worse ...,2.0,negative
1,1 overall good company work 2 timely payment s...,4.0,positive
2,worked l found top management always support s...,5.0,positive
3,good learning freedom work job security develo...,4.0,positive
4,company culture good payroll employee m1a cadr...,3.0,negative
5,continuous followup work top bottom work life ...,3.0,negative
6,civil supervisor electric fair laine haide wor...,5.0,positive
7,nothing like lnt politics every stage trust an...,2.0,negative
8,l construction multiple company future growth ...,5.0,positive
9,staff supporting eachother even living communi...,4.0,positive


- Encoding Labels and Making Train-Test Splits

In [135]:
from sklearn.preprocessing import LabelEncoder

In [136]:
# Feature text and target strings as NumPy arrays.
reviews = data["review"].values
labels = data["sentiment"].values

# LabelEncoder assigns integers to the sorted class names,
# i.e. 'negative' -> 0 and 'positive' -> 1.
encoder = LabelEncoder()
encoder.fit(labels)
encoded_labels = encoder.transform(labels)

- The dataset is then split into 80% train and 20% test parts using train_test_split from sklearn.model_selection.

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
# Stratified 80/20 split: both partitions keep the same positive/negative ratio,
# and the fixed seed makes the partition reproducible.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    reviews,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=encoded_labels,
)

- Vectorize text reviews to numbers

In [139]:
from sklearn.feature_extraction.text import CountVectorizer

In [140]:
# Bag-of-words counts, keeping at most the 3000 most frequent terms.
vec = CountVectorizer(max_features=3000)
# The vocabulary is learned from the training split only; the sparse result
# is converted to a dense array.
X = vec.fit_transform(train_sentences).toarray()
# The test split is encoded with the already-fitted vocabulary.
x_test = vec.transform(test_sentences).toarray()

- Model Generation

In [141]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the word-count matrix.
model = MultinomialNB()
model.fit(X, train_labels)

In [142]:
# Mean accuracy of MultinomialNB on the held-out test split.
model.score(x_test, test_labels)

0.7837837837837838

In [143]:
# Mean accuracy on the training split, for comparison with the test score.
model.score(X, train_labels)

0.8258258258258259

- Use logistic regression

In [144]:
from sklearn.linear_model import LogisticRegression

In [145]:
# L2-regularised logistic regression. max_iter is set above the library
# default of 100, presumably to give the solver room to converge.
lr = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=500,
    random_state=42,
)

In [146]:
# Train the logistic regression on the same word-count features.
lr.fit(X, train_labels)

In [147]:
# Mean accuracy of logistic regression on the held-out test split.
lr.score(x_test, test_labels)

0.7812812812812813

In [148]:
# Mean accuracy on the training split, for comparison with the test score.
lr.score(X, train_labels)

0.8511011011011012

- Gaussian Naive Bayes

In [149]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes models each feature as normally distributed per class
# and requires a dense input array.
# NOTE(review): the accuracies recorded for this model are far below the other
# two classifiers; sparse word counts look like a poor match for a Gaussian
# likelihood, so this is presumably kept only as a comparison point. Confirm.
gnb = GaussianNB()
gnb.fit(X, train_labels)

In [155]:
# Mean accuracy of GaussianNB on the held-out test split.
gnb.score(x_test, test_labels)

0.32482482482482483

In [151]:
# Mean accuracy on the training split, for comparison with the test score.
gnb.score(X, train_labels)

0.41453953953953954