# Importing Necessary Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Loading The Data Set

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submit = pd.read_csv("sample_submission.csv")

# Viewing the Sample Submission Format

In [3]:
sample_submit.head()

Unnamed: 0,User_ID,Is_Response
0,id80132,not_happy
1,id80133,happy
2,id80134,happy
3,id80135,not_happy
4,id80136,happy


# Viewing the Train data set

In [4]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


# Viewing the Test data Set

In [5]:
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


# Checking the shape of the Train Data Set

In [6]:
train.shape

(38932, 5)

# Checking the shape of the Test Data Set

In [7]:
test.shape

(29404, 4)

# Checking the Difference between the test and train data set

In [8]:
train.columns.difference(test.columns)

Index([u'Is_Response'], dtype='object')

# Creating a column and separating it with test and train data and concat as new data set

In [9]:
train["source"] = "train"
test["source"] = "test"

In [10]:
dataf = pd.concat([train,test],axis=0,sort=False)

# Checking the Na values in the data set

In [11]:
dataf.isna().sum()

User_ID             0
Description         0
Browser_Used        0
Device_Used         0
Is_Response     29404
source              0
dtype: int64

# checking the information about the data set

In [12]:
dataf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68336 entries, 0 to 29403
Data columns (total 6 columns):
User_ID         68336 non-null object
Description     68336 non-null object
Browser_Used    68336 non-null object
Device_Used     68336 non-null object
Is_Response     38932 non-null object
source          68336 non-null object
dtypes: object(6)
memory usage: 2.1+ MB


# Checking the stats of the data set

In [13]:
dataf.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,source
count,68336,68336,68336,68336,38932,68336
unique,68336,68336,11,3,2,2
top,id85509,My husband and son recently stayed at this hot...,Firefox,Desktop,happy,train
freq,1,1,13043,26375,26521,38932


# Checking the unique values in the data set

In [14]:
dataf.nunique()

User_ID         68336
Description     68336
Browser_Used       11
Device_Used         3
Is_Response         2
source              2
dtype: int64

# Checking the shape of new data set

In [15]:
dataf.shape

(68336, 6)

# checking the columns in the new data set

In [16]:
dataf.columns

Index([u'User_ID', u'Description', u'Browser_Used', u'Device_Used',
       u'Is_Response', u'source'],
      dtype='object')

# checking for the duplicate in the data set

In [17]:
dataf.duplicated().sum()

0

# Number of Words

In [18]:
dataf['word_count']=dataf['Description'].apply(lambda x: len(str(x).split(" ")))
dataf[['Description','word_count']].head()

Unnamed: 0,Description,word_count
0,The room was kind of clean but had a VERY stro...,46
1,I stayed at the Crown Plaza April -- - April -...,208
2,I booked this hotel through Hotwire at the low...,229
3,Stayed here with husband and sons on the way t...,93
4,My girlfriends and I stayed here to celebrate ...,294


# Number of characters

In [19]:
dataf['char_count'] = dataf['Description'].str.len() ## this also includes spaces
dataf[['Description','char_count']].head()

Unnamed: 0,Description,char_count
0,The room was kind of clean but had a VERY stro...,248
1,I stayed at the Crown Plaza April -- - April -...,1077
2,I booked this hotel through Hotwire at the low...,1327
3,Stayed here with husband and sons on the way t...,502
4,My girlfriends and I stayed here to celebrate ...,1613


# Average Word Length

In [20]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

dataf['avg_word'] = dataf['Description'].apply(lambda x: avg_word(x))
dataf[['Description','avg_word']].head()

Unnamed: 0,Description,avg_word
0,The room was kind of clean but had a VERY stro...,4
1,I stayed at the Crown Plaza April -- - April -...,4
2,I booked this hotel through Hotwire at the low...,4
3,Stayed here with husband and sons on the way t...,4
4,My girlfriends and I stayed here to celebrate ...,4


# Importing NLTK package

In [21]:
import nltk

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Number of stopwords

In [26]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
dataf['stopwords'] = train['Description'].apply(lambda x: len([x for x in x.split() if x in stop]))
dataf[['Description','stopwords']].head()

Unnamed: 0,Description,stopwords
0,The room was kind of clean but had a VERY stro...,23
1,I stayed at the Crown Plaza April -- - April -...,82
2,I booked this hotel through Hotwire at the low...,91
3,Stayed here with husband and sons on the way t...,36
4,My girlfriends and I stayed here to celebrate ...,127


# Number of special characters

In [27]:
dataf['SpecialChar'] = dataf['Description'].apply(lambda x: len([x for x in x.split() if x.startswith('-')]))
dataf[['Description','SpecialChar']].head()

Unnamed: 0,Description,SpecialChar
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,6
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,0
4,My girlfriends and I stayed here to celebrate ...,8


# Number of numerics

In [28]:
dataf['numerics'] = dataf['Description'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
dataf[['Description','numerics']].head()

Unnamed: 0,Description,numerics
0,The room was kind of clean but had a VERY stro...,0
1,I stayed at the Crown Plaza April -- - April -...,0
2,I booked this hotel through Hotwire at the low...,0
3,Stayed here with husband and sons on the way t...,0
4,My girlfriends and I stayed here to celebrate ...,0


# Number of Uppercase words

In [29]:
dataf['upper'] = dataf['Description'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
dataf[['Description','upper']].head()

Unnamed: 0,Description,upper
0,The room was kind of clean but had a VERY stro...,1
1,I stayed at the Crown Plaza April -- - April -...,8
2,I booked this hotel through Hotwire at the low...,9
3,Stayed here with husband and sons on the way t...,0
4,My girlfriends and I stayed here to celebrate ...,7


# Lower casing 

In [30]:
dataf['Description'] = dataf['Description'].apply(lambda x: " ".join(x.lower() for x in x.split()))
dataf['Description'].head()

0    the room was kind of clean but had a very stro...
1    i stayed at the crown plaza april -- - april -...
2    i booked this hotel through hotwire at the low...
3    stayed here with husband and sons on the way t...
4    my girlfriends and i stayed here to celebrate ...
Name: Description, dtype: object

# Removing Punctuation

In [31]:
dataf['Description'] = dataf['Description'].str.replace('[^\w\s]','')
dataf['Description'].head()

0    the room was kind of clean but had a very stro...
1    i stayed at the crown plaza april   april   th...
2    i booked this hotel through hotwire at the low...
3    stayed here with husband and sons on the way t...
4    my girlfriends and i stayed here to celebrate ...
Name: Description, dtype: object

# Removal of Stop Words

In [32]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
dataf['Description'] = dataf['Description'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
dataf['Description'].head()

0    room kind clean strong smell dogs generally av...
1    stayed crown plaza april april staff friendly ...
2    booked hotel hotwire lowest price could find g...
3    stayed husband sons way alaska cruise loved ho...
4    girlfriends stayed celebrate th birthdays plan...
Name: Description, dtype: object

# Common word removal or Keeping it

# let’s check the most frequently occurring words in our text data then take call to remove or retain . In which I have taken the 304 words as the total words is 92530 and square root of it 

In [33]:
freq = pd.Series(' '.join(dataf['Description']).split()).value_counts()[:304]
freq.head()

hotel    124947
room     108111
stay      46365
great     44817
staff     42681
dtype: int64

# Now Keep it from the Freq

In [34]:
freq = list(freq.index)
dataf['Description'] = dataf['Description'].apply(lambda x: " ".join(x for x in x.split() if x in freq))
dataf['Description'].head()

0    room clean ok stay youre would staying price r...
1    stayed staff friendly food restaurant little h...
2    booked hotel price could find got front desk m...
3    stayed husband way loved hotel great experienc...
4    stayed th weekend back looking us rooms rate s...
Name: Description, dtype: object

# Rare words removal

# Similarly, just as we removed the most common words, this time let’s remove rarely occurring words from the text. Because they’re so rare, the association between them and other words is dominated by noise. You can replace rare words with a more general form and then this will have higher counts

In [35]:
freq = pd.Series(' '.join(dataf['Description']).split()).value_counts()[-152:]
freq.head()

long       5900
across     5898
weekend    5886
problem    5884
rate       5848
dtype: int64

# Now Removing

In [36]:
freq = list(freq.index)
dataf['Description'] = dataf['Description'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
dataf['Description'].head()

0    room clean stay would staying price right brea...
1    stayed staff friendly food restaurant little p...
2    booked hotel price could find got front desk u...
3    stayed way hotel great experience room best fl...
4    stayed th back us rooms stayed nights parking ...
Name: Description, dtype: object

# Importing Text blob for Spelling correction

In [37]:
dataf['Description']

0        room clean stay would staying price right brea...
1        stayed staff friendly food restaurant little p...
2        booked hotel price could find got front desk u...
3        stayed way hotel great experience room best fl...
4        stayed th back us rooms stayed nights parking ...
5        rooms one nice clean bed old great location et...
6        stayed hotel times though hotel walk clean sta...
7        stayed city back found little hotel location p...
8        stayed trip san could recommend hotel called h...
9        wonderful staff great location price hotel fre...
10             times square nice rooms stayed nights great
11       stayed really nice hotel great location downto...
12       stay location never hotel walk get walking als...
13       recommend comfortable staff room clean really ...
14       found hotel clean located good free airport ro...
15       stayed th th much day trip another nights hote...
16       us hotel night night old hotel staff us room t.

In [None]:
from textblob import TextBlob
dataf['Description']=dataf['Description'][:68336].apply(lambda x: str(TextBlob(x).correct()))

# Tokenization  refers to dividing the text into a sequence of words or sentences.---Other way if Sentiment Analysis is not working we can use this

In [36]:
corpus = dataf['Description']
lables = dataf['Browser_Used']
corpus = np.array(corpus)
corpus

array(['room clean stay would staying price right breakfast free better',
       'stayed staff friendly food restaurant little pool little room floor two comfortable beds one tv little small small bit area could little every day never one floor work staff one door two sure room little would stay business would',
       'booked hotel price could find got front desk us room little would booked room would told rooms get go great got room room great location much say hotel room small bathroom small bathroom also staff parking day best also breakfast lobby small old coffee lobby service small rooms small view great location distance better',
       ...,
       'stayed nights little friendly staff asked times everything room large clean quiet comfortable good place stay stay',
       'stayed best could everything clean friendly trip nyc',
       'comfortable every way rooms good nice room really service room quiet location arrived front desk also took small bathroom hotel located quiet nyc s

In [38]:
corpus_df = pd.DataFrame({'Description': corpus, 
                          'Browser_Used': lables})
corpus_df = corpus_df[['Description', 'Browser_Used']]
corpus_df

Unnamed: 0,Description,Browser_Used
0,room clean stay would staying price right brea...,Edge
1,stayed staff friendly food restaurant little p...,Internet Explorer
2,booked hotel price could find got front desk u...,Mozilla
3,stayed way hotel great experience room best fl...,InternetExplorer
4,stayed th back us rooms stayed nights parking ...,Edge
5,rooms one nice clean bed old great location et...,InternetExplorer
6,stayed hotel times though hotel walk clean sta...,Firefox
7,stayed city back found little hotel location p...,Google Chrome
8,stayed trip san could recommend hotel called h...,Internet Explorer
9,wonderful staff great location price hotel fre...,Chrome


In [39]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [40]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['room clean stay would staying price right breakfast free better',
       'stayed staff friendly food restaurant little pool little room floor two comfortable beds one tv little small small bit area could little every day never one floor work staff one door two sure room little would stay business would',
       'booked hotel price could find got front desk us room little would booked room would told rooms get go great got room room great location much say hotel room small bathroom small bathroom also staff parking day best also breakfast lobby small old coffee lobby service small rooms small view great location distance better',
       ...,
       'stayed nights little friendly staff asked times everything room large clean quiet comfortable good place stay stay',
       'stayed best could everything clean friendly trip nyc',
       'comfortable every way rooms good nice room really service room quiet location arrived front desk also took small bathroom hotel located quiet nyc s

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 2],
       [0, 2, 0, ..., 0, 0, 2],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
# get all unique words in the corpus
vocab = cv.get_feature_names()
# show document feature vectors
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,airport,also,always,another,area,around,arrived,asked,away,back,...,want,wasnt,water,way,well,went,within,wonderful,work,would
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2
2,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,3
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# TF-IDF Model

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,airport,also,always,another,area,around,arrived,asked,away,back,...,want,wasnt,water,way,well,went,within,wonderful,work,would
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.24
1,0.00,0.00,0.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.15,0.17
2,0.00,0.19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.15
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.22,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.14,0.00,0.00,0.00,0.00,0.00,0.11,...,0.00,0.00,0.00,0.00,0.10,0.00,0.00,0.14,0.00,0.23
5,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.11
6,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.31,0.00,0.00,0.00,0.00
7,0.00,0.00,0.00,0.00,0.00,0.14,0.00,0.00,0.00,0.12,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.16,0.00,0.09
8,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.27,0.00,0.00
9,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.30,0.00,0.00


In [50]:
dataf['Description'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (0.388095238095, 0.633928571429)
1          (-0.01625, 0.548888888889)
2            (0.2125, 0.457692307692)
3                      (0.95, 0.7625)
4    (0.404166666667, 0.508333333333)
Name: Description, dtype: object

In [51]:
dataf['sentiment'] = dataf['Description'].apply(lambda x: TextBlob(x).sentiment[0] )
dataf[['Description','sentiment']].head()

Unnamed: 0,Description,sentiment
0,room clean stay would staying price right brea...,0.388095
1,stayed staff friendly food restaurant little p...,-0.01625
2,booked hotel price could find got front desk u...,0.2125
3,stayed way hotel great experience room best fl...,0.95
4,stayed th back us rooms stayed nights parking ...,0.404167


In [52]:
dataf

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,source,word_count,char_count,avg_word,stopwords,SpecialChar,numerics,upper,sentiment
0,id10326,room clean stay would staying price right brea...,Edge,Mobile,not happy,train,46,248,4,23,0,0,1,0.388095
1,id10327,stayed staff friendly food restaurant little p...,Internet Explorer,Mobile,not happy,train,208,1077,4,82,6,0,8,-0.016250
2,id10328,booked hotel price could find got front desk u...,Mozilla,Tablet,not happy,train,229,1327,4,91,0,0,9,0.212500
3,id10329,stayed way hotel great experience room best fl...,InternetExplorer,Desktop,happy,train,93,502,4,36,0,0,0,0.950000
4,id10330,stayed th back us rooms stayed nights parking ...,Edge,Tablet,not happy,train,294,1613,4,127,8,0,7,0.404167
5,id10331,rooms one nice clean bed old great location et...,InternetExplorer,Desktop,happy,train,112,610,4,50,4,0,3,0.418519
6,id10332,stayed hotel times though hotel walk clean sta...,Firefox,Tablet,not happy,train,95,492,4,51,1,0,1,0.283333
7,id10333,stayed city back found little hotel location p...,Google Chrome,Mobile,happy,train,164,935,4,69,2,0,4,0.167708
8,id10334,stayed trip san could recommend hotel called h...,Internet Explorer,Desktop,happy,train,117,641,4,48,1,0,5,0.366071
9,id10335,wonderful staff great location price hotel fre...,Chrome,Tablet,not happy,train,58,358,5,20,1,0,0,0.683333


In [82]:
##dataf.groupby("Browser_Used")['Is_Response','sentiment'].head(5)

In [67]:
dataf.Browser_Used.unique()

array(['Edge', 'Internet Explorer', 'Mozilla', 'InternetExplorer',
       'Firefox', 'Google Chrome', 'Chrome', 'IE', 'Opera',
       'Mozilla Firefox', 'Safari'], dtype=object)

In [73]:
dataf["Browser_Used"] = dataf.Browser_Used.replace({'Internet Explorer':'IE','InternetExplorer':'IE'})

In [74]:
dataf.Browser_Used.unique()

array(['Edge', 'IE', 'Mozilla', 'Firefox', 'Google Chrome', 'Chrome',
       'Opera', 'Mozilla Firefox', 'Safari'], dtype=object)

In [75]:
dataf["Browser_Used"] = dataf.Browser_Used.replace({'Mozilla':'Firefox','Mozilla Firefox':'Firefox'})

In [76]:
dataf.Browser_Used.unique()

array(['Edge', 'IE', 'Firefox', 'Google Chrome', 'Chrome', 'Opera',
       'Safari'], dtype=object)

In [77]:
dataf["Browser_Used"] = dataf.Browser_Used.replace({'Google Chrome':'Chrome'})

In [78]:
dataf.Browser_Used.unique()

array(['Edge', 'IE', 'Firefox', 'Chrome', 'Opera', 'Safari'], dtype=object)

In [87]:
dataf.Device_Used.unique()

array(['Mobile', 'Tablet', 'Desktop'], dtype=object)

In [86]:
dataf

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,source,word_count,char_count,avg_word,stopwords,SpecialChar,numerics,upper,sentiment
0,id10326,room clean stay would staying price right brea...,Edge,Mobile,not happy,train,46,248,4,23,0,0,1,0.388095
1,id10327,stayed staff friendly food restaurant little p...,IE,Mobile,not happy,train,208,1077,4,82,6,0,8,-0.016250
2,id10328,booked hotel price could find got front desk u...,Firefox,Tablet,not happy,train,229,1327,4,91,0,0,9,0.212500
3,id10329,stayed way hotel great experience room best fl...,IE,Desktop,happy,train,93,502,4,36,0,0,0,0.950000
4,id10330,stayed th back us rooms stayed nights parking ...,Edge,Tablet,not happy,train,294,1613,4,127,8,0,7,0.404167
5,id10331,rooms one nice clean bed old great location et...,IE,Desktop,happy,train,112,610,4,50,4,0,3,0.418519
6,id10332,stayed hotel times though hotel walk clean sta...,Firefox,Tablet,not happy,train,95,492,4,51,1,0,1,0.283333
7,id10333,stayed city back found little hotel location p...,Chrome,Mobile,happy,train,164,935,4,69,2,0,4,0.167708
8,id10334,stayed trip san could recommend hotel called h...,IE,Desktop,happy,train,117,641,4,48,1,0,5,0.366071
9,id10335,wonderful staff great location price hotel fre...,Chrome,Tablet,not happy,train,58,358,5,20,1,0,0,0.683333


In [91]:
Browser_dummy=pd.get_dummies(dataf.Browser_Used,prefix = "Browser")

In [92]:
dataf = pd.concat([dataf,Browser_dummy],axis = 1)

In [95]:
dataf=dataf.drop('Browser_Used',axis=1)

In [96]:
dataf.head()

Unnamed: 0,User_ID,Description,Device_Used,Is_Response,source,word_count,char_count,avg_word,stopwords,SpecialChar,numerics,upper,sentiment,Browser_Chrome,Browser_Edge,Browser_Firefox,Browser_IE,Browser_Opera,Browser_Safari
0,id10326,room clean stay would staying price right brea...,Mobile,not happy,train,46,248,4,23,0,0,1,0.388095,0,1,0,0,0,0
1,id10327,stayed staff friendly food restaurant little p...,Mobile,not happy,train,208,1077,4,82,6,0,8,-0.01625,0,0,0,1,0,0
2,id10328,booked hotel price could find got front desk u...,Tablet,not happy,train,229,1327,4,91,0,0,9,0.2125,0,0,1,0,0,0
3,id10329,stayed way hotel great experience room best fl...,Desktop,happy,train,93,502,4,36,0,0,0,0.95,0,0,0,1,0,0
4,id10330,stayed th back us rooms stayed nights parking ...,Tablet,not happy,train,294,1613,4,127,8,0,7,0.404167,0,1,0,0,0,0


In [97]:
dataf["Is_Response"] = dataf.Is_Response.replace({'happy':1,'not happy':0})

In [114]:
dataf["source"] = dataf.source.replace({'train':1,'test':0})

In [115]:
dataf.head()

Unnamed: 0,User_ID,Description,Is_Response,source,word_count,char_count,avg_word,stopwords,SpecialChar,numerics,...,sentiment,Browser_Chrome,Browser_Edge,Browser_Firefox,Browser_IE,Browser_Opera,Browser_Safari,Device_Desktop,Device_Mobile,Device_Tablet
0,id10326,room clean stay would staying price right brea...,0.0,1,46,248,4,23,0,0,...,0.388095,0,1,0,0,0,0,0,1,0
1,id10327,stayed staff friendly food restaurant little p...,0.0,1,208,1077,4,82,6,0,...,-0.01625,0,0,0,1,0,0,0,1,0
2,id10328,booked hotel price could find got front desk u...,0.0,1,229,1327,4,91,0,0,...,0.2125,0,0,1,0,0,0,0,0,1
3,id10329,stayed way hotel great experience room best fl...,1.0,1,93,502,4,36,0,0,...,0.95,0,0,0,1,0,0,1,0,0
4,id10330,stayed th back us rooms stayed nights parking ...,0.0,1,294,1613,4,127,8,0,...,0.404167,0,1,0,0,0,0,0,0,1


In [117]:
Device_dummy=pd.get_dummies(dataf.Device_Used,prefix = "Device")

In [100]:
dataf = pd.concat([dataf,Device_dummy],axis = 1)

In [101]:
dataf=dataf.drop('Device_Used',axis=1)

In [118]:
dataf.head()

Unnamed: 0,User_ID,Description,Is_Response,source,word_count,char_count,avg_word,stopwords,SpecialChar,numerics,...,sentiment,Browser_Chrome,Browser_Edge,Browser_Firefox,Browser_IE,Browser_Opera,Browser_Safari,Device_Desktop,Device_Mobile,Device_Tablet
0,id10326,room clean stay would staying price right brea...,0.0,1,46,248,4,23,0,0,...,0.388095,0,1,0,0,0,0,0,1,0
1,id10327,stayed staff friendly food restaurant little p...,0.0,1,208,1077,4,82,6,0,...,-0.01625,0,0,0,1,0,0,0,1,0
2,id10328,booked hotel price could find got front desk u...,0.0,1,229,1327,4,91,0,0,...,0.2125,0,0,1,0,0,0,0,0,1
3,id10329,stayed way hotel great experience room best fl...,1.0,1,93,502,4,36,0,0,...,0.95,0,0,0,1,0,0,1,0,0
4,id10330,stayed th back us rooms stayed nights parking ...,0.0,1,294,1613,4,127,8,0,...,0.404167,0,1,0,0,0,0,0,0,1


In [122]:
data_column_category = dataf.select_dtypes(exclude=[np.number]).columns

In [123]:
data_column_Integer = dataf.columns.difference(data_column_category)

In [124]:
dataf[data_column_category].nunique()

User_ID        68336
Description    68315
dtype: int64

In [125]:
dataf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68336 entries, 0 to 29403
Data columns (total 21 columns):
User_ID            68336 non-null object
Description        68336 non-null object
Is_Response        38932 non-null float64
source             68336 non-null int64
word_count         68336 non-null int64
char_count         68336 non-null int64
avg_word           68336 non-null int64
stopwords          68336 non-null int64
SpecialChar        68336 non-null int64
numerics           68336 non-null int64
upper              68336 non-null int64
sentiment          68336 non-null float64
Browser_Chrome     68336 non-null uint8
Browser_Edge       68336 non-null uint8
Browser_Firefox    68336 non-null uint8
Browser_IE         68336 non-null uint8
Browser_Opera      68336 non-null uint8
Browser_Safari     68336 non-null uint8
Device_Desktop     68336 non-null uint8
Device_Mobile      68336 non-null uint8
Device_Tablet      68336 non-null uint8
dtypes: float64(2), int64(8), object(2), uint

In [126]:
data_model= pd.concat([dataf["User_ID"],dataf[data_column_Integer]],axis=1)

In [127]:
data_model.shape

(68336, 20)

In [139]:
train_modified = data_model.loc[data_model.source==1,:]

In [140]:
test_modified = data_model.loc[data_model.source==0.0,:]

In [141]:
test_modified.shape

(29404, 20)

In [142]:
train_modified.shape

(38932, 20)

In [143]:
test_modified.columns

Index([u'User_ID', u'Browser_Chrome', u'Browser_Edge', u'Browser_Firefox',
       u'Browser_IE', u'Browser_Opera', u'Browser_Safari', u'Device_Desktop',
       u'Device_Mobile', u'Device_Tablet', u'Is_Response', u'SpecialChar',
       u'avg_word', u'char_count', u'numerics', u'sentiment', u'source',
       u'stopwords', u'upper', u'word_count'],
      dtype='object')

In [144]:
test_modified.drop(columns=['source','Is_Response'],inplace=True)

In [145]:
test_modified.shape

(29404, 18)

In [146]:
train_modified.drop(columns=['source'],inplace=True)

In [147]:
train_modified.shape

(38932, 19)

In [148]:
train_modified.to_csv("train_ready_for_model.csv",index=False)

In [149]:
test_modified.to_csv("test_ready_for_model.csv",index=False)

# Modeling