In [None]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn deprecation
# and chained-assignment warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-processed, labelled reviews and show (rows, columns).
clean_data = pd.read_csv(filepath_or_buffer='preprocessed_label.csv')
clean_data.shape

(49994, 9)

In [None]:
clean_data.head()

Unnamed: 0,text,label,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons
0,new rule waiting table almost always cant wait...,4,616,111,-0.208864,1,0,1,0
1,flirted giving two star pretty damning rating ...,3,1124,206,0.107488,0,0,0,0
2,staying planet hollywood across street saw goo...,5,593,111,0.364722,0,1,0,0
3,food good price super expensive buck extra lar...,2,706,130,0.030485,0,0,1,0
4,worse company deal horrible work bring truck b...,1,630,124,-0.172321,0,0,0,0


In [None]:
# Target: cast the label to int, then to a categorical column.
clean_data['label'] = clean_data['label'].astype(int).astype('category')
# Feature text: force every entry to str so the vectorizer only ever sees strings.
clean_data['text'] = clean_data['text'].astype(str)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49994 entries, 0 to 49993
Data columns (total 9 columns):
text                  49994 non-null object
label                 49994 non-null category
Text Length           49994 non-null int64
Word Count            49994 non-null int64
Sentiment Polarity    49994 non-null float64
Capital Words         49994 non-null int64
Exclamation           49994 non-null int64
Ques Marks            49994 non-null int64
Emoticons             49994 non-null int64
dtypes: category(1), float64(1), int64(6), object(1)
memory usage: 3.1+ MB


# Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over uni-, bi- and tri-grams with sub-linear term-frequency
# scaling and L2-normalised rows; the vocabulary is capped at 10,000 terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    max_features=10000,
    sublinear_tf=True,
    norm='l2',
)
# Learn the vocabulary / IDF weights from the labelled reviews and vectorise them.
X = vectorizer.fit_transform(clean_data['text'])
X.shape

In [None]:
# Convert the sparse TF-IDF matrix to a dense DataFrame whose columns are the
# vocabulary terms. `toarray()` materialises the full rows x features matrix in memory.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = list(vectorizer.get_feature_names_out())
else:
    columns = vectorizer.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=columns)
print(df.shape)

In [None]:
df.head()

In [None]:
# Append the hand-crafted numeric features next to the TF-IDF columns.
# Mapping: new column name in `df` -> source column in `clean_data`.
extra_features = {
    'Sentiment': 'Sentiment Polarity',
    'Capital': 'Capital Words',
    'Exclamation': 'Exclamation',
    'Question': 'Ques Marks',
    'Emoticons': 'Emoticons',
}
for new_col, source_col in extra_features.items():
    df[new_col] = clean_data[source_col]

In [None]:
df.head()

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled reviews for evaluation (test_size=0.1).
# The seed is fixed so that the split, and therefore the reported accuracy,
# is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df, clean_data["label"], test_size=0.1, random_state=2018
)

## Logistic Regression

In [None]:
from sklearn import linear_model

# Hyper-parameters of the multinomial logistic regression:
# L2 penalty with C=0.6, class weights balanced by class frequency,
# no intercept term, newton-cg solver capped at 250 iterations.
lr_params = dict(
    multi_class='multinomial',
    solver='newton-cg',
    penalty='l2',
    C=0.6,
    tol=0.0001,
    max_iter=250,
    class_weight="balanced",
    fit_intercept=False,
)

# Fit on the training split of the labelled data.
mul_lr = linear_model.LogisticRegression(**lr_params).fit(X_train, y_train)

In [None]:
# from sklearn.model_selection import cross_val_score

# # cross validation score
# scores = cross_val_score(mul_lr, X_train, y_train, cv=3)
# scores

In [None]:
from sklearn import metrics

# Predict the held-out split with the fitted model.
pred = mul_lr.predict(X_test)

# Fraction of held-out reviews whose predicted label equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=pred)

## First quarter of the unlabeled data

In [None]:
# Load the pre-processed, unlabeled reviews and show (rows, columns).
data = pd.read_csv(filepath_or_buffer='preprocessed_unlabel.csv')
data.shape

(598439, 8)

In [None]:
# Take the first 149,999 unlabeled reviews as an independent copy.
# Later cells assign columns on this frame (text cast, predicted label); working
# on an explicit copy keeps those writes off `data` and avoids pandas'
# chained-assignment (SettingWithCopy) hazard on a slice.
data_part1 = data.iloc[:149999].copy()
data_part1.shape

(149999, 8)

In [None]:
data_part1.head()

Unnamed: 0,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons
0,good experience wife sat bar great pizza wing ...,500,90,0.229545,0,0,0,0
1,first montreal gf came eat nice mid day lunch ...,1624,296,0.326205,0,0,0,0
2,one favorite place go cold rainy re defines co...,143,27,0.3,0,1,0,0
3,doctor very nice got good amount time feel lik...,707,140,0.325179,2,3,0,0
4,nook immediate phoenix staple came met fam not...,1126,197,0.322321,0,1,0,0


In [None]:
# Force every review to str so the vectorizer only ever sees strings.
data_part1['text'] = data_part1['text'].astype(str)
data_part1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149999 entries, 0 to 149998
Data columns (total 8 columns):
text                  149999 non-null object
Text Length           149999 non-null int64
Word Count            149999 non-null int64
Sentiment Polarity    149999 non-null float64
Capital Words         149999 non-null int64
Exclamation           149999 non-null int64
Ques Marks            149999 non-null int64
Emoticons             149999 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 9.2+ MB


In [None]:
# Vectorise the unlabeled reviews with the already-fitted vectorizer
# (transform only, so the vocabulary and IDF weights are not re-learned here).
X = vectorizer.transform(data_part1['text'])


In [None]:
# Convert the sparse TF-IDF matrix to a dense DataFrame whose columns are the
# vocabulary terms. `toarray()` materialises the full rows x features matrix in memory.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = list(vectorizer.get_feature_names_out())
else:
    columns = vectorizer.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=columns)
print(df.shape)

(149999, 5000)


In [None]:
# Append the hand-crafted numeric features next to the TF-IDF columns.
# Mapping: new column name in `df` -> source column in `data_part1`.
extra_features = {
    'Sentiment': 'Sentiment Polarity',
    'Capital': 'Capital Words',
    'Exclamation': 'Exclamation',
    'Question': 'Ques Marks',
    'Emoticons': 'Emoticons',
}
for new_col, source_col in extra_features.items():
    df[new_col] = data_part1[source_col]

In [None]:
df.head()

Unnamed: 0,able,able get,absolute,absolutely,absolutely amazing,absolutely delicious,absolutely love,absolutely no,ac,accept,...,yum,yummy,zero,zero star,zucchini,Sentiment,Capital,Exclamation,Question,Emoticons
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190621,...,0.0,0.0,0.0,0.0,0.0,0.229545,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.326205,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.3,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.325179,2,3,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.322321,0,1,0,0


In [None]:
# Pseudo-label the unlabeled reviews with the model fitted on the labelled data.
pred1 = mul_lr.predict(df)

In [None]:
len(pred1)

149999

In [None]:
# Store the predicted (pseudo) labels next to the reviews they belong to.
data_part1['label'] = pred1.tolist()
data_part1.head()

Unnamed: 0,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons,label
0,good experience wife sat bar great pizza wing ...,500,90,0.229545,0,0,0,0,3
1,first montreal gf came eat nice mid day lunch ...,1624,296,0.326205,0,0,0,0,4
2,one favorite place go cold rainy re defines co...,143,27,0.3,0,1,0,0,5
3,doctor very nice got good amount time feel lik...,707,140,0.325179,2,3,0,0,1
4,nook immediate phoenix staple came met fam not...,1126,197,0.322321,0,1,0,0,5


In [None]:
# Re-read the labelled reviews and put the columns in the order
# text, the seven numeric features, then label (label moves to the last position).
column_order = [
    'text',
    'Text Length',
    'Word Count',
    'Sentiment Polarity',
    'Capital Words',
    'Exclamation',
    'Ques Marks',
    'Emoticons',
    'label',
]
labeled = pd.read_csv('preprocessed_label.csv')[column_order]
labeled.head()

Unnamed: 0,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons,label
0,new rule waiting table almost always cant wait...,616,111,-0.208864,1,0,1,0,4
1,flirted giving two star pretty damning rating ...,1124,206,0.107488,0,0,0,0,3
2,staying planet hollywood across street saw goo...,593,111,0.364722,0,1,0,0,5
3,food good price super expensive buck extra lar...,706,130,0.030485,0,0,1,0,2
4,worse company deal horrible work bring truck b...,630,124,-0.172321,0,0,0,0,1


In [None]:
# Stack the human-labelled reviews on top of the pseudo-labelled ones with a fresh
# 0..n-1 index. `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
# equivalent that works on both old and new pandas.
combined = pd.concat([labeled, data_part1], ignore_index=True)

In [None]:
combined.shape

(199993, 9)

In [None]:
# Prepare the combined (labelled + pseudo-labelled) frame for a second model:
# label cast to int and then to a categorical column, text forced to str.
combined['label'] = combined['label'].astype(int).astype('category')
combined['text'] = combined['text'].astype(str)
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199993 entries, 0 to 199992
Data columns (total 9 columns):
text                  199993 non-null object
Text Length           199993 non-null int64
Word Count            199993 non-null int64
Sentiment Polarity    199993 non-null float64
Capital Words         199993 non-null int64
Exclamation           199993 non-null int64
Ques Marks            199993 non-null int64
Emoticons             199993 non-null int64
label                 199993 non-null category
dtypes: category(1), float64(1), int64(6), object(1)
memory usage: 12.4+ MB


In [None]:
# Feature extraction on the combined corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over uni-, bi- and tri-grams with sub-linear term-frequency
# scaling and L2-normalised rows; the vocabulary is capped at 5,000 terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    max_features=5000,
    sublinear_tf=True,
    norm='l2',
)
# Re-learn the vocabulary / IDF weights on the combined reviews and vectorise them.
X = vectorizer.fit_transform(combined['text'])
X.shape

(199993, 5000)

In [None]:
# Convert the sparse TF-IDF matrix to a dense DataFrame whose columns are the
# vocabulary terms. `toarray()` materialises the full rows x features matrix in memory.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = list(vectorizer.get_feature_names_out())
else:
    columns = vectorizer.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=columns)
print(df.shape)

(199993, 5000)


In [None]:
# df.reindex()
# combined.reindex()

In [None]:
# Append the hand-crafted numeric features next to the TF-IDF columns.
# Mapping: new column name in `df` -> source column in `combined`.
extra_features = {
    'Sentiment': 'Sentiment Polarity',
    'Capital': 'Capital Words',
    'Exclamation': 'Exclamation',
    'Question': 'Ques Marks',
    'Emoticons': 'Emoticons',
}
for new_col, source_col in extra_features.items():
    df[new_col] = combined[source_col]

In [None]:
df.head()

Unnamed: 0,ability,able,able get,absolute,absolutely,absolutely delicious,absolutely love,absolutely no,ac,accept,...,yum,yummy,zero,zero star,zucchini,Sentiment,Capital,Exclamation,Question,Emoticons
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.208864,1,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.107488,0,0,0,0
2,0.0,0.104349,0.0,0.0,0.1072,0.167016,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.364722,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.030485,0,0,1,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.172321,0,0,0,0


In [None]:
# Split the combined data: 80% train / 20% hold-out.
from sklearn.model_selection import train_test_split

# The seed is fixed so the split and the reported accuracy are reproducible.
# NOTE(review): `combined` contains rows whose label was predicted by the first
# model, so this hold-out mixes human labels with pseudo-labels; accuracy measured
# on it is likely optimistic compared with accuracy on human-labelled data only.
X_train, X_test, y_train, y_test = train_test_split(
    df, combined["label"], test_size=0.2, random_state=2018
)

In [None]:
# y_train=combined["label"]
# X_train=df

In [None]:
from sklearn import linear_model

# Same hyper-parameters as the first model: multinomial logistic regression with
# L2 penalty (C=0.6), balanced class weights, no intercept, newton-cg solver
# capped at 250 iterations.
lr_params = dict(
    multi_class='multinomial',
    solver='newton-cg',
    penalty='l2',
    C=0.6,
    tol=0.0001,
    max_iter=250,
    class_weight="balanced",
    fit_intercept=False,
)

# Refit on the training split of the combined data (rebinds `mul_lr`).
mul_lr = linear_model.LogisticRegression(**lr_params).fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict the held-out split with the refitted model.
pred = mul_lr.predict(X_test)

# Fraction of held-out rows whose predicted label equals the stored label.
metrics.accuracy_score(y_true=y_test, y_pred=pred)

0.8606465161629041

## Test data

In [None]:
# Load the pre-processed test reviews (rebinds `data`) and show (rows, columns).
data = pd.read_csv(filepath_or_buffer='preprocessed_test.csv')
data.shape

(50000, 9)

In [None]:
# data_part1 = unlabeled_data[:149999]
# data_part1.shape

In [None]:
data.head()

Unnamed: 0,test_id,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons
0,test_1,trying nice quiet dinner announcer award givea...,109,20,0.233333,0,0,0,0
1,test_2,getting food go yr wife usually tend get same ...,659,124,0.158929,0,0,0,0
2,test_3,ugh ve eat couple time work event course make ...,1335,248,0.047309,7,3,0,0
3,test_4,people nice ordered eat promptly called double...,174,32,0.3375,0,3,0,1
4,test_5,heard alot good thing place decided grab break...,529,95,0.304625,0,2,0,0


In [None]:
# Force every review to str so the vectorizer only ever sees strings.
data['text'] = data['text'].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
test_id               50000 non-null object
text                  50000 non-null object
Text Length           50000 non-null int64
Word Count            50000 non-null int64
Sentiment Polarity    50000 non-null float64
Capital Words         50000 non-null int64
Exclamation           50000 non-null int64
Ques Marks            50000 non-null int64
Emoticons             50000 non-null int64
dtypes: float64(1), int64(6), object(2)
memory usage: 3.4+ MB


In [None]:
# Vectorise the test reviews with the already-fitted vectorizer
# (transform only, so the vocabulary and IDF weights are not re-learned here).
X = vectorizer.transform(data['text'])

In [None]:
# convert the vector object to dataframe and append the column names with features.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when available and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = list(vectorizer.get_feature_names_out())
else:
    columns = vectorizer.get_feature_names()
# Dense DataFrame with one column per vocabulary term.
df = pd.DataFrame(X.toarray(), columns=columns)
print(df.shape)

(50000, 5000)


In [None]:
# Append the hand-crafted numeric features next to the TF-IDF columns.
# Mapping: new column name in `df` -> source column in `data`.
extra_features = {
    'Sentiment': 'Sentiment Polarity',
    'Capital': 'Capital Words',
    'Exclamation': 'Exclamation',
    'Question': 'Ques Marks',
    'Emoticons': 'Emoticons',
}
for new_col, source_col in extra_features.items():
    df[new_col] = data[source_col]

In [None]:
df.head()

Unnamed: 0,able,able get,absolute,absolutely,absolutely amazing,absolutely delicious,absolutely love,absolutely no,ac,accept,...,yum,yummy,zero,zero star,zucchini,Sentiment,Capital,Exclamation,Question,Emoticons
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.233333,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.158929,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.047309,7,3,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.3375,0,3,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.304625,0,2,0,0


In [None]:
# Predict a label for every test review with the current `mul_lr` model.
pred1 = mul_lr.predict(df)

In [None]:
pred1

array([3, 3, 1, ..., 2, 5, 4], dtype=int64)

In [None]:
# Store the predicted labels next to the test reviews they belong to.
data['label'] = pred1.tolist()
data.head()

Unnamed: 0,test_id,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons,label
0,test_1,trying nice quiet dinner announcer award givea...,109,20,0.233333,0,0,0,0,3
1,test_2,getting food go yr wife usually tend get same ...,659,124,0.158929,0,0,0,0,3
2,test_3,ugh ve eat couple time work event course make ...,1335,248,0.047309,7,3,0,0,1
3,test_4,people nice ordered eat promptly called double...,174,32,0.3375,0,3,0,1,5
4,test_5,heard alot good thing place decided grab break...,529,95,0.304625,0,2,0,0,5


In [None]:
data.head()

Unnamed: 0,test_id,text,Text Length,Word Count,Sentiment Polarity,Capital Words,Exclamation,Ques Marks,Emoticons,label
0,test_1,trying nice quiet dinner announcer award givea...,109,20,0.233333,0,0,0,0,3
1,test_2,getting food go yr wife usually tend get same ...,659,124,0.158929,0,0,0,0,3
2,test_3,ugh ve eat couple time work event course make ...,1335,248,0.047309,7,3,0,0,1
3,test_4,people nice ordered eat promptly called double...,174,32,0.3375,0,3,0,1,5
4,test_5,heard alot good thing place decided grab break...,529,95,0.304625,0,2,0,0,5


In [None]:
data.shape

(50000, 10)

In [None]:
# Keep only the two columns written to the submission file: id and predicted label.
kaggle = data.loc[:, ['test_id', 'label']]

In [None]:
kaggle.head()

Unnamed: 0,test_id,label
0,test_1,3
1,test_2,3
2,test_3,1
3,test_4,5
4,test_5,5


In [None]:
# Write the submission as UTF-8 CSV without the DataFrame index column.
kaggle.to_csv(path_or_buf='kaggle1.csv', index=False, encoding='utf-8')