# Method Followed
1. Extract the company name
2. Pick a company and work on it.
3. Take all the inbound tweets which are first in the conversation. (i.e. in_response_to is null) and classify them into various topics using LDA
4. Find the topic for each tweet


In [295]:
import pandas as pd
import numpy as np

import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords

from nltk.stem.wordnet import WordNetLemmatizer
import string


twcs = pd.read_csv('twcs.csv')

In [299]:
#Get all the tweets which are inbound and extract the company using tweet 'text' - This gives us ~7.5 lakh tweets
ibnd = twcs.query('inbound==True and in_response_to_tweet_id.isnull()').reset_index(drop=True)
ibnd['company'] = ibnd['text'].apply(lambda x : None if len(re.findall("@[0-9a-zA-Z_]+", x))==0 else re.findall("@[0-9a-zA-Z_]+", x)[0][1:])

#For tweets whose company is null from above(~50k tweets), find their company by using the below logic.
#Logic: find the tweets which have respond to this tweet and take the author_id of the first tweet. 

ibnd['response_tweet_id_1'] = ibnd.apply(lambda x : None if pd.notnull(x['company']) or pd.isnull(x['response_tweet_id']) else [int(i) for i in x['response_tweet_id'].split(',')][0], axis=1  )
ibnd = ibnd.merge(twcs[['tweet_id', 'author_id']], how='left', left_on='response_tweet_id_1', right_on='tweet_id')
ibnd['company'] = ibnd.apply(lambda x : x['company'] if pd.notnull(x['company']) else x['author_id_y'], axis=1)

#Drop the tweets whose company can't be found even after apply the above logic. <500 tweets are dropped
ibnd = ibnd[pd.notnull(ibnd['company'])][['text', 'company']]

#Making a copy of ibnd
ibnd_copy = ibnd.copy()

In [302]:
#Clean the tweets for Apple and find the topics

#Select Apple tweets
ibnd = ibnd_copy.query('company=="AppleSupport"').reset_index(drop=True)

#remove @ and # words
ibnd['text'] = ibnd['text'].apply(lambda x : re.sub('[@#][0-9a-zA-Z_]+', '', x))

#Remove URLs
ibnd['text'] = ibnd['text'].apply(lambda x: re.sub('https?://\S+|www\.\S+', '', x))

#Removing new line character
ibnd['text'] = ibnd['text'].apply(lambda x: x.replace("\n", ""))


#Remove punctuation, stop words and lemmatize
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

text_clean = [clean(text).split() for text in ibnd['text'].values]


In [303]:
import gensim
from gensim import corpora

#Term dictionary
dictionary = corpora.Dictionary(text_clean)

#Document-Term matrix
doc_term_matrix = [dictionary.doc2bow(text) for text in text_clean]

# Lda model
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=50)

print(ldamodel.print_topics(num_topics=10, num_words=10))

[(0, '0.045*"app" + 0.024*"help" + 0.020*"can’t" + 0.019*"update" + 0.018*"message" + 0.016*"iphone" + 0.016*"apps" + 0.014*"please" + 0.013*"open" + 0.013*"download"'), (1, '0.114*"i️" + 0.072*"fix" + 0.024*"letter" + 0.023*"please" + 0.023*"type" + 0.017*"glitch" + 0.017*"keyboard" + 0.016*"it" + 0.015*"issue" + 0.015*"this"'), (2, '0.072*"io" + 0.053*"iphone" + 0.042*"battery" + 0.041*"update" + 0.026*"phone" + 0.025*"6" + 0.024*"11" + 0.023*"since" + 0.016*"new" + 0.016*"7"'), (3, '0.051*"apple" + 0.037*"iphone" + 0.029*"x" + 0.017*"watch" + 0.014*"help" + 0.014*"phone" + 0.013*"get" + 0.012*"new" + 0.011*"store" + 0.011*"one"'), (4, '0.034*"photo" + 0.031*"macbook" + 0.022*"icloud" + 0.021*"mac" + 0.021*"ipad" + 0.021*"sierra" + 0.021*"pro" + 0.020*"high" + 0.012*"help" + 0.011*"picture"'), (5, '0.064*"phone" + 0.037*"keep" + 0.029*"screen" + 0.023*"time" + 0.022*"iphone" + 0.019*"update" + 0.013*"new" + 0.013*"every" + 0.012*"it’s" + 0.011*"even"'), (6, '0.042*"apple" + 0.029*"mu

[(0, '0.115*"iphone" + 0.029*"x" + 0.025*"7" + 0.024*"plus" + 0.022*"issue" + 0.021*"8" + 0.020*"6" + 0.019*"screen" + 0.017*"anyone" + 0.014*"problem"'), (1, '0.066*"time" + 0.052*"every" + 0.043*"screen" + 0.027*"phone" + 0.017*"“i”" + 0.016*"keep" + 0.014*"lock" + 0.013*"home" + 0.010*"go" + 0.010*"“it”"'), (2, '0.046*"app" + 0.038*"apple" + 0.030*"music" + 0.024*"can’t" + 0.016*"work" + 0.016*"call" + 0.013*"download" + 0.012*"store" + 0.012*"doesn’t" + 0.011*"apps"'), (3, '0.048*"watch" + 0.031*"de" + 0.025*"apple" + 0.019*"que" + 0.019*"charger" + 0.017*"hell" + 0.016*"told" + 0.016*"la" + 0.012*"3" + 0.012*"se"'), (4, '0.105*"i️" + 0.067*"fix" + 0.022*"phone" + 0.022*"letter" + 0.021*"type" + 0.020*"question" + 0.019*"please" + 0.018*"shit" + 0.018*"going" + 0.017*"box"'), (5, '0.038*"macbook" + 0.027*"glitch" + 0.026*"sierra" + 0.025*"pro" + 0.025*"high" + 0.018*"ipad" + 0.017*"mac" + 0.017*"working" + 0.014*"keyboard" + 0.013*"explain"'), (6, '0.026*"help" + 0.024*"apple" + 0.

In [293]:
#Finding the topic for each tweet

threshold = 0.3

topics = []

def get_topic(bow):
    probs = ldamodel.get_document_topics(dictionary.doc2bow(bow))
    topic = sorted(probs, key=lambda x : x[1], reverse=True)[0]
    return topic[0] if topic[1]>threshold else 100

for bow in text_clean:
    topics.append(get_topic(bow))
    
pd.Series(topics).value_counts()

8      11356
6       9372
4       6234
2       4099
100     3278
0       3002
7       2332
1       1635
5       1321
9       1038
3        855
dtype: int64