In [1]:
import pandas as pd
import os
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used in
# this notebook; remove the filter temporarily when debugging unexpected behaviour.
warnings.filterwarnings("ignore")

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('DomainCTrain.csv')

In [3]:
# Preview the first rows: a free-text `message` plus one T/F flag column per category.
df.head()

Unnamed: 0,message,food,recharge,support,reminders,travel,nearby,movies,casual,other
0,7am everyday,F,F,F,T,F,F,F,F,F
1,chocolate cake,T,F,F,F,F,F,F,F,F
2,closed mortice and tenon joint door dimentions,F,F,T,F,F,F,F,F,F
3,train eppo kelambum,F,F,F,F,T,F,F,F,F
4,yesterday i have cancelled the flight ticket,F,F,F,F,T,F,F,F,F


In [4]:
def label_race(row):
    """Collapse a row's one-hot "T"/"F" category flags into one category name.

    The flag columns are inspected in a fixed priority order and the name of the
    first column whose value equals "T" is returned. Columns after the first match
    are never read. If none of the eight checked columns is "T", the row is
    labelled "other" (the `other` column itself is not consulted).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) indexable by the category column names

    Returns
    -------
    str : the chosen category name
    """
    priority_order = (
        'food', 'recharge', 'support', 'reminders',
        'travel', 'nearby', 'movies', 'casual',
    )
    for column in priority_order:
        if row[column] == "T":
            return column
    return "other"

In [5]:
# Derive a single text label per message from the one-hot flag columns.
df['category'] = df.apply(label_race, axis=1)

# The flag columns are redundant once `category` exists, so remove them.
flag_columns = ['food', 'recharge', 'support', 'reminders', 'nearby', 'movies', 'casual', 'other', 'travel']
df = df.drop(columns=flag_columns)

In [6]:
# Check the result of the relabelling: each message now carries one `category` value.
df.head()

Unnamed: 0,message,category
0,7am everyday,reminders
1,chocolate cake,food
2,closed mortice and tenon joint door dimentions,support
3,train eppo kelambum,travel
4,yesterday i have cancelled the flight ticket,travel


In [7]:
# (rows, columns) of the relabelled training frame.
df.shape

(40659, 2)

In [8]:
# Balance the classes: draw 1000 messages from every category with a fixed seed.
# Each group is sampled explicitly and the pieces are concatenated in group order, so
# `category` stays an ordinary column. The groupby().apply() form additionally pushed
# `category` into the index, which makes the name ambiguous and breaks this cell when
# it is re-run, and it depends on apply() handing the grouping column to the callback.
# NOTE: sample(n=1000) raises ValueError if a category has fewer than 1000 rows.
df = pd.concat(
    [group.sample(n=1000, random_state=0) for _, group in df.groupby('category')]
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on an explicit copy of the message column so that later column assignments on
# `all_text` unambiguously modify this frame and never a view of `df`.
all_text = df[['message']].copy()

In [10]:
# Normalise case so the same word is not treated as several different tokens.
all_text['message'] = all_text['message'].str.lower()

In [11]:
# TF-IDF vectoriser that discards scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')

In [12]:
# Fit the vectoriser on the sampled messages and vectorise them in one step.
# NOTE(review): fitting happens before the train/validation split further down, so the
# validation rows also contribute to the fitted vocabulary and weights.
vectors = tfidf.fit_transform(all_text['message'])

In [13]:
# Convert the vectoriser output to a dense array used as the feature matrix.
# NOTE(review): presumably done for convenience; the dense copy can be large, and the
# estimators used later may accept the sparse matrix directly — confirm before changing.
X = vectors.toarray()

In [14]:
from sklearn.preprocessing import LabelEncoder

# Encode the category names as integer class ids. The fitted encoder `le` is reused
# later to encode the test labels with the same mapping.
le = LabelEncoder()
y = le.fit_transform(df['category'])

In [15]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Hold out 30% of the sampled data for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.3, random_state=42)

In [16]:
# Logistic regression: fit on the training split, then score on the validation split.
log_reg = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
log_accuracy = accuracy_score(y_val, y_pred)

print('Logistic accuracy:', log_accuracy)

Logistic accuracy: 0.7066666666666667


In [17]:
# Multinomial naive Bayes: fit on the training split, then score on the validation split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred = nb.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_pred)

print('Naive bayes accuracy:', nb_accuracy)

Naive bayes accuracy: 0.7114814814814815


In [18]:
# Linear SVM: fit on the training split, then score on the validation split.
lvsm = LinearSVC(random_state=0).fit(X_train, y_train)
y_pred = lvsm.predict(X_val)
lvsm_accuracy = accuracy_score(y_val, y_pred)

print('LinearSVM accuracy:', lvsm_accuracy)

LinearSVM accuracy: 0.7125925925925926


In [19]:
# Evaluation on the separate test file.
# Load it and derive the single `category` label the same way as for the training data.
df_test = pd.read_csv('DomainCTest.csv')
df_test["category"] = df_test.apply(label_race, axis=1)

# Remove the one-hot flag columns by keyword. The positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
drop = ["food", "recharge", "support", "reminders", "nearby", "movies", "casual", "other", "travel"]
df_test = df_test.drop(columns=drop)

In [20]:
# Preview the relabelled test rows.
df_test.head()

Unnamed: 0,message,category
0,Nearest metro station,nearby
1,Pick up n drop service trough cab,travel
2,I wants to buy a bick,other
3,Show me pizza,food
4,What is the cheapest package to andaman and ni...,travel


In [22]:
# Lower-case the test messages, mirroring the preprocessing of the training text.
all_text = df_test["message"].str.lower()

# Vectorise with the already-fitted TF-IDF object (`tfidf`): transform only, no refit.
X_test = tfidf.transform(all_text).toarray()

# Encode the test labels with the already-fitted label encoder (`le`).
y_test = le.transform(df_test["category"])

# Logistic regression model (`log_reg`) on the test set.
y_pred = log_reg.predict(X_test)
log_accuracy_2 = accuracy_score(y_test, y_pred)
print(f"{log_accuracy_2} is the accuracy of the logistic regression model")

# Naive Bayes model (`nb`) on the test set.
y_pred = nb.predict(X_test)
nb_accuracy_2 = accuracy_score(y_test, y_pred)
print(f"{nb_accuracy_2} is the accuracy of the Naive Bayes model")

# Linear SVM model (`lvsm`) on the test set.
y_pred = lvsm.predict(X_test)
lvsm_accuracy_2 = accuracy_score(y_test, y_pred)
print(f"{lvsm_accuracy_2} is the accuracy of the Support Vector model")

0.77 is the accuracy of the logistic regression model
0.6839 is the accuracy of the Naive Bayes model
0.7604 is the accuracy of the Support Vector model


In [23]:
#LSI model

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim.models.lsimodel import LsiModel
from gensim import corpora
from pprint import pprint

In [24]:
# Shared resources used by clean() below.
# English stop-word list from NLTK, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Punctuation characters (string.punctuation) to strip from the text.
exclude = set(string.punctuation)
# WordNet lemmatizer applied to each remaining word.
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalise one document for topic modelling.

    Steps, in order:
      1. lower-case the text and drop whitespace-separated words found in `stop`;
      2. delete every character found in `exclude`;
      3. lemmatize each remaining word with `lemma`.

    Stop words are removed before punctuation is stripped, so a stop word with
    punctuation attached (e.g. "the,") is kept.

    Parameters
    ----------
    doc : str, the raw document text

    Returns
    -------
    str : the cleaned words joined by single spaces
    """
    kept_words = [token for token in doc.lower().split() if token not in stop]
    without_punct = "".join(filter(lambda ch: ch not in exclude, " ".join(kept_words)))
    lemmas = [lemma.lemmatize(token) for token in without_punct.split()]
    return " ".join(lemmas)

In [25]:
# Pull the sampled training messages out of the frame as a plain Python list.
list_of_docs = df["message"].tolist()

# Clean every message and split it into words: the result is a list of word lists.
doc_clean = [clean(doc).split() for doc in list_of_docs]

In [26]:
# Build the gensim dictionary from the cleaned, tokenised messages.
dictionary = corpora.Dictionary(doc_clean)

# Convert each tokenised message to its bag-of-words representation via the dictionary.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Fit an LSI model with 5 topics on the bag-of-words corpus and pretty-print its topics.
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
pprint(lsimodel.print_topics())

[(0,
  '0.347*"reminder" + 0.267*"like" + 0.267*"cancel" + 0.266*"would" + '
  '0.256*"offset" + 0.256*"apiname" + 0.256*"exotel" + 0.256*"userid" + '
  '0.255*"taskname" + 0.255*"reminderlist"'),
 (1,
  '0.831*"want" + 0.221*"u" + 0.187*"know" + 0.181*"movie" + 0.135*"book" + '
  '0.128*"ticket" + 0.114*"need" + 0.108*"hi" + 0.096*"please" + '
  '0.092*"service"'),
 (2,
  '-0.451*"reminder" + 0.328*"call" + 0.316*"u" + 0.233*"wake" + '
  '-0.205*"water" + 0.197*"march" + 0.192*"wakeup" + -0.185*"every" + '
  '-0.181*"drink" + -0.168*"want"'),
 (3,
  '-0.611*"u" + 0.418*"want" + -0.244*"need" + -0.238*"reminder" + '
  '-0.197*"please" + -0.143*"movie" + -0.117*"service" + 0.102*"wake" + '
  '-0.101*"near" + -0.101*"help"'),
 (4,
  '0.620*"need" + -0.510*"u" + 0.492*"movie" + 0.189*"offer" + -0.137*"want" + '
  '0.115*"ticket" + 0.058*"know" + 0.052*"today" + -0.051*"find" + '
  '0.050*"book"')]


In [27]:
#LDA model

from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for LDA models trained with various numbers of topics.

    One LDA model is trained for every topic count in ``range(start, limit, step)``
    and its c_v coherence is recorded.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus the LDA models are trained on
    texts : List of input texts, passed to the coherence model
    limit : Upper bound on the number of topics (exclusive)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    topic_list : No. of topics tried, in order
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Train on the `corpus` argument rather than a notebook-level variable, so the
        # function works for whatever corpus the caller passes in.
        model = LdaModel(corpus, random_state=0, num_topics=num_topics, id2word=dictionary, iterations=10)
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values

In [28]:
# Train LDA models with 1, 6, ..., 36 topics and collect their c_v coherence scores.
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    start=1,
    limit=41,
    step=5,
)
print(coherence_value_list)

# The chosen topic count is the first one that reaches the highest coherence score.
best_coherence = max(coherence_value_list)
max_index = coherence_value_list.index(best_coherence)
opt_topic = topic_list[max_index]
print("Optimum no. of topics:", opt_topic)

# Refit LDA with the chosen topic count, this time with 30 passes over the corpus.
lda_model = LdaModel(
    corpus=doc_term_matrix,
    num_topics=opt_topic,
    id2word=dictionary,
    iterations=10,
    passes=30,
    random_state=0,
)

# Display the word weights of topic id 1.
lda_model.print_topic(1)

[0.3287476298674388, 0.4781725872247256, 0.46115894529180984, 0.5253656906944698, 0.5285224478360521, 0.5593474153069067, 0.5595998259747449, 0.5761708096054737]
Optimum no. of topics: 36


'0.276*"near" + 0.104*"location" + 0.077*"place" + 0.058*"restaurant" + 0.055*"timing" + 0.048*"me" + 0.029*"budget" + 0.014*"lunch" + 0.014*"issue" + 0.012*"finding"'