## Definición de tópicos

Grupo Play: Manuel Brito, Ezequiel Ortiz Recalde, Lucas Romeo

In [1]:
# Instalaciones necesarias para poder seguir el proceso

#!pip install langdetect
# correr en la terminal:
# python -m nltk.downloader averaged_perceptron_tagger

In [2]:
# Importamos los paquetes necesarios
import pandas as pd
import numpy as np
import re
import string
from langdetect import detect

import sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('influencer_1_120.csv')
df.shape

(10056, 11)

In [4]:
# Dropeamos observaciones con reacciones deshabilitadas
a = df[df['Reaction']=='Disable'].index
df.drop(a,inplace=True)

df.reset_index(drop=True,inplace=True)

In [5]:
df['Embedding'] = df['Embedding'].str.strip('[]')
df['hashtag_q'] = df['Embedding'].str.findall(r'(hashtag)').map(len)
df['company_q'] = df['Embedding'].str.findall(r'(/company/)').map(len)
df['person_q'] = df['Embedding'].str.findall(r'(/in/|lnkd)').map(len)
df.loc[df['Image'] != '0','Image'] = 1
df.loc[df['LinkedIn_Video'] != '0','LinkedIn_Video'] = 1
df.loc[df['External_Video'] != '0','External_Video'] = 1
df.loc[df['Podcast'] != '0','Podcast'] = 1
df.loc[df['Article'] != '0','Article'] = 1
df.loc[df['Comment'] == 'Disable','Comment'] = '0'
df['Comment'] = df['Comment'].str.extract(r'(\d+)')
df['Comment'] = df['Comment'].str.replace(r'.','')
df['Comment'] = df['Comment'].astype(int)
df['Followers'] = df['Followers'].str.replace(r',','')
df['Followers'] = df['Followers'].str.replace(r'.','')
df['Followers'] = df['Followers'].astype(int)
df.loc[df['Reaction'] == 'Disable','Reaction'] = '0'
df['Reaction'] = df['Reaction'].str.replace(r'.','')
df['Reaction'] = df['Reaction'].str.replace(r',','')
df['Reaction'] = df['Reaction'].astype(int)

modelo = ['Name', 'Followers', 'Header', 'Comment',
              'Reaction', 'Image', 'LinkedIn_Video', 'External_Video', 'Podcast',
              'Article', 'hashtag_q', 'company_q', 'person_q']
df = df[modelo]

df.columns = ['Name', 'Followers', 'Text', 'Comment',
              'Reaction', 'Image', 'LinkedIn_Video', 'External_Video', 'Podcast',
              'Article', 'hashtag_q', 'company_q', 'person_q']

## 1. Limpieza del texto

### Filtro de idiomas que no sean inglés

In [6]:
lang = []
indices = []
for i in df['Text']:
    try:
        lang.append(detect(i))
    except:
        lang.append(0)

In [7]:
# Filtro de posts que no están en ingles
df['lang'] = lang
other_lang = df[(df['lang']!=0) & (df['lang']!='en') & (df['lang']!='da') & (df['lang']!='ro') & (df['lang']!='ca')].index

df.drop(other_lang,inplace=True)
df.reset_index(drop=True,inplace=True)

# Filtro adicional para palabras en francés que el detector no identificó
df['validation']=df['Text'].str.extract(r'(joly|très|plaisir|dernier|Politique|pandémie|Mondiale|Economique|Tout|Quelle|depuis|Pour|fais|stratégie|suivez|Fais|cette|assez|vois|avec|Avec|Propriétaires|Vous| il y a|Pendant)')
french = df.loc[df['validation'].notnull()].index
df.drop(french,inplace=True)
df.reset_index(drop=True,inplace=True)

In [8]:
df.shape

(9587, 15)

In [9]:
# Consolidación de expresiones
df['Text'] = df['Text'].str.replace(r'(COVID19|Covid-19|COVID-19|covid19|covid-19|COVID-19)','COVID')
df['Text'] = df['Text'].str.replace(r'(working)','work')

## Buscamos los sustantivos y adjetivos, sacamos stopwords y lematizamos:

In [10]:
string.punctuation = string.punctuation + '—’'
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word.lower() for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

def text_process(text):
    stop_words = stopwords.words('english') + ['us','new','get','one','many','want','de','en','c','please',\
                                               'thank','thank','thanks','think','news',\
                                               'great','powerful','good',\
                                               'amazing','youre','ive','im','la','joly','de','en',\
                                               'year','today','day','also',\
                                               'month','would','will','paypal','could','wont'\
                                               'excited','dont',\
                                               "we're",'it’s','i’m','—','-','1',\
                                               '2','3','4','5','6','7','8','9','10',\
                                               'tomorrow','monday','tuesday','wednesay','thursday',\
                                               'friday','saturday','sunday','going','everyone','join',\
                                               'others','know','made','make','like','see','take','thats',\
                                               'well','way','week','look','looking','come','linkedin']
    
    nopunc = [char for char in text if char not in string.punctuation] 
    nopunc = ''.join(nopunc)
    return ' '.join([word.lower() for word in nopunc.split() if word.lower() not in set(stop_words)])

In [11]:
df['clean_Text'] = df['Text'].apply(nouns_adj)
df['clean_Text'] = df['Text'].apply(text_process)
df['clean_Text'] = df['clean_Text'].apply(lambda x: ' '.join(WordNetLemmatizer().lemmatize(term) for term in x.split()))
df['clean_Text'] = df['clean_Text'].str.replace(r'(leadership)','leader')

## 2. Matriz de tokens y LDA
Reducimos la dimensionalidad requiriendo que los tokens aparezcan en al menos el 3% del total de todos los posts

In [12]:
vectorizer = CountVectorizer(analyzer='word',min_df=0.03,token_pattern='[a-zA-Z0-9]{3,}')
data_vectorized = vectorizer.fit_transform(df['clean_Text'])

In [13]:
# Probamos distintos learning_decay y numeros de componentes
params = {'learning_decay': [.5, .7, .9],'n_components': [3,4,5]}
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
model = GridSearchCV(lda, param_grid=params)
model.fit(data_vectorized)      

GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [3, 4, 5]})

In [14]:
# Mejor modelo
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 3}
Model Perplexity:  66.2533271514195


Alcanzamos el mejor resultado con 3 componentes. Procedemos a ver su composición

In [15]:
# Document Matrix a dataframe
lda_output = best_lda_model.transform(data_vectorized)
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
docnames = [str(i) for i in range(len(df['Text']))]
adf_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
dominant_topic = np.argmax(adf_document_topic.values, axis=1)
adf_document_topic['dominant_topic'] = dominant_topic

## Top N Words per topic

In [16]:
# Top N Palabras por topico
def show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)

# Topic words df
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,leader,woman,team,work,ceo,community,forward,support,together,live,opportunity,excited,read,industry,career
Topic 1,time,business,company,world,year,first,pandemic,global,technology,important,love,thing,right,every,work
Topic 2,people,help,work,share,book,need,change,future,learn,next,employee,health,better,best,covid


A simple vista, consideramos que se podrían etiquetar los tópicos de la siguiente forma:
- Topic 0: Tendencias
- Topic 1: Negocios
- Topic 2: Responsabilidad social

In [17]:
# Topic words matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Columna e índice
df_topic_keywords.columns = vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
df_topic_keywords.columns

Index(['2020', 'around', 'back', 'best', 'better', 'book', 'business',
       'career', 'ceo', 'challenge', 'change', 'community', 'company',
       'conversation', 'country', 'covid', 'customer', 'economy', 'employee',
       'even', 'every', 'excited', 'experience', 'feel', 'find', 'first',
       'forward', 'friend', 'future', 'global', 'group', 'health', 'help',
       'important', 'industry', 'job', 'last', 'leader', 'learn', 'life',
       'live', 'love', 'much', 'need', 'next', 'opportunity', 'organization',
       'pandemic', 'part', 'partner', 'people', 'proud', 'question', 'read',
       'right', 'role', 'share', 'support', 'team', 'technology', 'thing',
       'thought', 'time', 'together', 'virtual', 'woman', 'work', 'world',
       'year'],
      dtype='object')

## 3. Consolidación de resultados

Dada la configuración de la matriz de tokens, era de esperar que algunas observaciones con palabras poco frecuentes no posean un tópico definido. A continuación las descartamos

In [18]:
# Conservamos los índices de las observaciones con tópico indefinido
x = adf_document_topic[(adf_document_topic['Topic0']==0.33)&(adf_document_topic['Topic1']==0.33)].index

In [19]:
df_topics = pd.DataFrame(adf_document_topic)
df_topics.reset_index(inplace=True)

In [20]:
df1 = df
df1.reset_index(inplace=True)

In [21]:
df_topics['index']=df_topics['index'].astype(int)
df_final = pd.merge(df1,df_topics,on='index')

In [22]:
columnas = ['Name', 'Followers', 'Text', 'Comment', 'Reaction', 'Image','LinkedIn_Video', 'External_Video', 'Podcast', 'Article',
            'hashtag_q','company_q', 'person_q','Topic0','Topic1', 'Topic2', 'dominant_topic']
df_final=df_final[columnas]

x = x.astype(int)

In [23]:
df_final.drop(x,inplace=True)
df_final.reset_index(drop=True,inplace=True)

In [24]:
# Feature que indica la cantidad de caracteres
df_final['text_len']=df_final['Text'].apply(len)

In [25]:
df_final['topic_proba']=df_final[['Topic0','Topic1','Topic2']].apply(max,axis=1)

In [26]:
df_final.columns
columnas = ['Name', 'Followers', 'Text', 'Comment', 'Reaction', 'Image',
            'LinkedIn_Video', 'External_Video', 'Podcast', 'Article', 'hashtag_q',
            'company_q', 'person_q', 'dominant_topic', 'topic_proba','text_len']

df_final=df_final[columnas]

### Asignación de etiquetas a tópicos

Asignamos las etiquetas a los tópicos en función de sus palabras más frecuentes, resaltando que esta no es la única forma de hacerlo.

In [27]:
df_final.loc[df_final['dominant_topic'] == 0,'dominant_topic']= 'tendencias'
df_final.loc[df_final['dominant_topic'] == 1,'dominant_topic']= 'negocios'
df_final.loc[df_final['dominant_topic'] == 2,'dominant_topic']= 'responsabilidad social'

# 4. Dataset final

Guardamos el resultado en un dataset para poder utilizarlo en la siguiente parte del proceso: el armado de clusters y su caracterización

In [28]:
#df_final = pd.read_csv('dataset3.csv')