<a href="https://colab.research.google.com/github/pfescriva/Applied-AI-for-Business-Insight/blob/main/TFM_Pere.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code to load libraries:

# !pip install mglearn
# !pip install gensim
# !pip install stop_words
# !pip install pyLDAvis
# !pip install langdetect
# !pip install unidecode

import sklearn as sk
import pandas as pd 
import numpy as np 
import mglearn
from stop_words import get_stop_words
import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.sklearn
import warnings
from gensim.utils import simple_preprocess
from gensim import utils 
from collections import Counter
import matplotlib.pyplot as plt; plt.rcdefaults()
import matplotlib.pyplot as plt
from langdetect import detect
import spacy
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel


In [5]:

import pandas as pd

path = '/content/drive/MyDrive/TFM data/urjc.jsonl.gz'
data = pd.read_json(path, lines = True)


In [None]:

# 0. Some initial cleaning (Empty cols and constant cols)
data.dropna(how = 'all', axis = 1, inplace = True) # Drop empty columns
data = data.loc[:, (data != data.iloc[0]).any()]  # Drop Constant columns
print(data.shape[0])

# Filter out non-spanish documents
data = data.loc[(data['lang'] == "es")].reset_index(drop  = True)
print('First lang filter: ' + str(data.shape[0]))

# Filter out non-spanish documents with higher precission.
data['Language'] = data['text'].apply(detect)
data = data.loc[(data['Language'] == "es")].reset_index(drop  = True)
print('First lang filter: ' + str(data.shape[0]))

# Keep only non RT information
data = data.loc[ ~ (data['text'].str.startswith('RT '))]
print('RT removal: ' + str(data.shape[0]))

# 1. Drop duplicated documents that are writen by the same user, and laso drop any tweet that is NA (There were not anyway)
data = data.drop_duplicates(subset = ['author_id', 'text'], keep = 'first', inplace = False, ignore_index = False).reset_index(drop = True)
data = data.dropna(subset = ['text'])
print('Dropped duplicates and empty texts: ' + str(data.shape[0]))



In [7]:
data.to_pickle("clean_data.pkl")

In [9]:
# data = pd.read_pickle("clean_data.pkl")
data = data[data['in_reply_to_user_id'].isnull()]

In [None]:

import os
os.getcwd()


In [26]:

"""

PRE-PROCESS: 

0. Get the unduplicated / non-retweeted tweets. 


NON-TUNED PRE-PROCESSING STEPS:

1. Remove the hashtags, links and shares

2. Lowcase 

3. Remove punctuation 

4. Remove accents 

5. Remove numbers (TBC)

6. Lemmatise 

7. Make some corrections uncovered above


TUNED STEPS:

8. Remove stopwords (TBC properly) 

9. Remove short tweets 

10. Group by author and gridsearch best LDA hyperparameters.  

"""


# 1. Remove links, hashtags and mentions
# --------------------------------------

def hashtag(txt):
    
    # Remove hashtags
    result = ' '.join(word for word in txt.split(' ') if not word.startswith('#'))
    
    # Remove shares
    result = ' '.join(word for word in result.split(' ') if not word.startswith('@'))
    # check if this doesn't work well use: result = re.sub(r'@\S+', '', result)
        
    # Remove some jajajas
    result = ' '.join(word for word in result.split(' ') if 'jaja' not in word.lower())
    result = ' '.join(word for word in result.split(' ') if 'jj' not in word.lower())
    result = ' '.join(word for word in result.split(' ') if 'haha' not in word.lower())

    # Remove links
    result = re.sub(r'http\S+', '', result)

    # Further work with potential remainings
    result = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", result)
    result = re.sub(r'[¿“‘?’"„“<>,!"]', "", result)
    result = re.sub('!', "", result)
    
    return result

    
data['text'] = data['text'].apply(hashtag)


# 2. Lowcase 
# --------------------------------------

data['text'] = data['text'].apply(np.char.lower)



# 3. Remove punctuation from text
# --------------------------------------

from gensim.parsing.preprocessing import strip_punctuation
data['text'] = data['text'].apply(strip_punctuation)



# 4. Remove accents from text
# --------------------------------------

import unidecode
data['text'] = data['text'].apply(unidecode.unidecode)
data['text'] = data['text'].apply(gensim.utils.deaccent) # In case we apply both 



# 5. Remove numbers
# --------------------------------------

data['text'] = data['text'].str.replace('\d+', '')



# 6. Lemmatisation
# --------------------------------------
# The purpose of this code is to lemmatize (Convert to dictionary form) the words in each document, 
# keeping the same format

nlp = spacy.load('es_core_news_sm', exclude = ['derechos', 'expres', 'ademas', 'traves', 'adios'], disable = ['derechos', 'expres', 'ademas', 'traves', 'adios'])

def lemmatizer(text):  
  doc = nlp(text)
  return ' '.join([word.lemma_ for word in doc])

data['lemmatized'] = data['text'].apply(lambda x: lemmatizer(x)) 



# 7. Further cleaning
# --------------------------------------

def corrector(text):     
    text = text.replace(' lumnos ', ' alumno ') 
    text = text.replace(' estudiante ', ' alumno ') # Synonims I think it's worth having them as the same word
    text = text.replace(' avda ', ' avenida ')
    text = text.replace(' info ', ' informacion ')
    text = text.replace(' uni ', ' universidad ')
    text = text.replace(' almerio ', ' almeria ')
    text = text.replace(' estudio ', ' estudiar ')
    text = text.replace(' cambier ', ' cambiar ')
    
    return text

data['lemmatized'] = data['lemmatized'].apply(corrector)

from gensim.parsing.preprocessing import strip_multiple_whitespaces
data['lemmatized'] = data['lemmatized'].apply(strip_multiple_whitespaces)



# 8. Stopwords work 
# --------------------------------------

# Find the list of stopwords

# Note that the removal depends on the library and will be part of each. 
stop_words = stopwords.words('spanish')
stop_words_extension = get_stop_words('es')
stop_words.extend(stop_words_extension)

# Remove accents from stopwords (I'll work with no accents anywhere, since we don't expect people to write always with accents) 
stop_words = [gensim.utils.deaccent(each_word) for each_word in stop_words]

stop_words.extend(['interesante', 'universidadjuancarlos', 'él', 'hala', 'juanca', 'reyjuancar', 'urjcritica', 'this', 'was', 'date', 'great', 'my', 'first', 'claro', 'queydondeestudiar2020', 'rey', 'juan', 'literalmente', 'universidadreyjuancarlo', 'universidadjuancarlos', 'lareyjuancarlos', 'universidadreyjuancarlos', 'universidadreyjuancarlo', 'urjc', 'gracia', 'gracias','ajjaj', 'xd', 'xdd', 'xdxd', 'Madrid', 'primero', 'segundo', 'tercero', 'cuarto', 'quinto', 'sexto', 'septimo', 'octavo', 't', 'a', 's', 'k', 'q', 'mas', 'tambien', 'ir', 'alla', 'cosa', 'iee', 'siquiera', 'in', 'the', 'of', 'ano', 'n', 'p', 'r', 'asi', 'coincidir', 'aa', 'decir', '¡', '¿', 'poder', 'podeis', 'estais', 'traves', 'alguno', 'buen', 'nosotrxs', 'bufff', 'buff', 'universitario', 'lol', 'omg', 'wtf', 'idk', 'fyi', 'tbh', 'lmao', 'asap', 'thanks', 'thank', 'thx', 
                'trav', 'bastante', 'muchisimo', 'muchisimos', 'muchisima', 'muchisimas', 'muchisimar', 'monton', 'habia', 'xe', 'jo', 'ops', 'ups', 'yupi', 'poca', 'poco', 'enhorabuena', 'incluso', 'igual', 'ahora', 'despu', 'pese', 'ser', 'cierto', 'haber', 'for', 'par', 'universidad', 'with', 'asimismo', 'carlos', 'universidad', 'iii', 'ser', 'entonces', 'madrid', 'tras', 'jo él', 'hacer', 'espán', 'noticia', 'as', 'periodico', 'diario', 'news', 'si', 'aca', 'mas', 'ademas', 'gracias', 'aqui', 'hola', 'saludos', 'hoy', 'co', 'bien', 'ver', 'dar', 'vosotrxs', 'wow', 'ahora', 'solo', 'todo', 'cualquier', 'cualquiera', 'pues', 'vez', 'mismo', 'evidentemente', 'uee', 'hey', 'hello', 'xvii', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xviii', 'ixx', 'xx', 'xxi', 
                'parte', 'normalmente', 'mil', 'adio', 'millon', '!', 'yosoyurjc', 'haced', 'hacerte', 'mientras', 'menos', 'después', 'despues', 'dias', 'cuyo', 'cuya', 'of', '_', 'aun', 'nunca', 'siempre', 'muchisima', 'dia', 'algun', 'pon', 'sino', 'mejor', 'poca', 'peor', 'ano', 'nuevo', '200', '-', 'uee', 'href', 'uno', 'dos', 'tres', 'espán', 'medal él', 'desmontar él', 'jo él', 'joder', 'char él', 'queydondeestudiar', 'mer él', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve', 'diez', 'veintena', 'doce', 'docena', 'trece', 'veitiuno', 'cien', 'decena', 'x2', '–', 'https', 'http', 'rt', 'm', 'uc', 'alguien', 'cada', 'latest', 'tmb', 'via', 'tal', 'etc', 'etcetera', 'traves', 'ma', 'ahi', 'aqui', 'alli', 'alla', 'so', 'if', 'mmm', 'hmm', 'ja', 'cuidadosamente', 'quizas', 'quiza', 'nada', 'nunca', 'probablemente', 'mismisimo', 'totalmente', 'completamente'])

stop_words.extend(['informacion', 'titulo', 'alumno', 'estudiar', 'profesor', 'primerisimo', 'adema', 'yomequedoencasa'])

# Frequent words that add litle meaning: Hyperparameter in gridsearch
extras = ['seguro', 'realizar', 'titulo', 'clase', 'tener', 'necesitar', 'querer', 'abrir', 'agarrar', 'andar', 'caminar', 'buscar', 'caer', 'conocer', 'saber', 'hablar', 'tener', 'tomar', 'poner', 'dar', 'ir', 'decir', 'estar', 'ser', 'vivir', 'oir', 'poner', 'traer', 'sentir', 'ver', 'mirar', 'llegar', 'llevar', 'entender', 'oler', 'salir', 'comprender', 'rayar', 'tener', 'flipar', 'putear', 'darle', 'saber', 'pasar', 'poner', 'esperar', 'dejar', 'parecer', 'salir', 'seguir', 'creer', 'opinar', 'informacion', 'alumno']


  data['text'] = data['text'].str.replace('\d+', '')


OSError: ignored

In [25]:
# spacy.load('es_core_news_sm')

!python -m spacy download es_core_news_sm

Collecting es_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2 MB)
[K     |████████████████████████████████| 16.2 MB 5.2 MB/s 
Building wheels for collected packages: es-core-news-sm
  Building wheel for es-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for es-core-news-sm: filename=es_core_news_sm-2.2.5-py3-none-any.whl size=16172933 sha256=d5309bbd5dcefb01e96de7e65d80d0eedca80c54aa9937da351dc10a74676dd1
  Stored in directory: /tmp/pip-ephem-wheel-cache-ofxql430/wheels/21/8d/a9/6c1a2809c55dd22cd9644ae503a52ba6206b04aa57ba83a3d8
Successfully built es-core-news-sm
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [22]:
import es_core_web_sm

ModuleNotFoundError: ignored

KeyboardInterrupt: ignored

# Nueva sección

In [None]:

"""

PRE-PROCESS: 

0. Get the unduplicated / non-retweeted tweets. 


NON-TUNED PRE-PROCESSING STEPS:

1. Remove the hashtags, links and shares

2. Lowcase 

3. Remove punctuation 

4. Remove accents 

5. Remove numbers (TBC)

6. Lemmatise 

7. Make some corrections uncovered above


TUNED STEPS:

8. Remove stopwords (TBC properly) 

9. Remove short tweets 

10. Group by author and gridsearch best LDA hyperparameters.  

"""


# 1. Remove links, hashtags and mentions
# --------------------------------------

def hashtag(txt):
    
    # Remove hashtags
    result = ' '.join(word for word in txt.split(' ') if not word.startswith('#'))
    
    # Remove shares
    result = ' '.join(word for word in result.split(' ') if not word.startswith('@'))
    
    # Remove some jajajas
    result = ' '.join(word for word in result.split(' ') if 'jaja' not in word.lower())
    result = ' '.join(word for word in result.split(' ') if 'jj' not in word.lower())
    result = ' '.join(word for word in result.split(' ') if 'haha' not in word.lower())

    # Remove links
    result = re.sub(r'http\S+', '', result)

    # Further work with potential remainings
    result = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", result)
    result = re.sub(r'[¿“‘?’"„“<>,!"]', "", result)
    result = re.sub('!', "", result)
    
    return result

    
data['text'] = data['text'].apply(hashtag)


# 2. Lowcase 
# --------------------------------------

data['text'] = data['text'].apply(np.char.lower)



# 3. Remove punctuation from text
# --------------------------------------

from gensim.parsing.preprocessing import strip_punctuation
data['text'] = data['text'].apply(strip_punctuation)



# 4. Remove accents from text
# --------------------------------------

import unidecode
data['text'] = data['text'].apply(unidecode.unidecode)
data['text'] = data['text'].apply(gensim.utils.deaccent) # In case we apply both 



# 5. Remove numbers
# --------------------------------------

data['text'] = data['text'].str.replace('\d+', '')



# 6. Lemmatisation
# --------------------------------------
# The purpose of this code is to lemmatize (Convert to dictionary form) the words in each document, 
# keeping the same format

nlp = spacy.load('es_core_news_sm', exclude = ['derechos', 'expres', 'ademas', 'traves', 'adios'], disable = ['derechos', 'expres', 'ademas', 'traves', 'adios'])

def lemmatizer(text):  
  doc = nlp(text)
  return ' '.join([word.lemma_ for word in doc])

data['lemmatized'] = data['text'].apply(lambda x: lemmatizer(x)) 



# 7. Further cleaning
# --------------------------------------

def corrector(text):     
    text = text.replace(' lumnos ', ' alumno ') 
    text = text.replace(' estudiante ', ' alumno ') # Synonims I think it's worth having them as the same word
    text = text.replace(' avda ', ' avenida ')
    text = text.replace(' info ', ' informacion ')
    text = text.replace(' uni ', ' universidad ')
    text = text.replace(' almerio ', ' almeria ')
    text = text.replace(' estudio ', ' estudiar ')
    text = text.replace(' cambier ', ' cambiar ')
    
    return text

data['lemmatized'] = data['lemmatized'].apply(corrector)

from gensim.parsing.preprocessing import strip_multiple_whitespaces
data['lemmatized'] = data['lemmatized'].apply(strip_multiple_whitespaces)



# 8. Stopwords work 
# --------------------------------------

# Find the list of stopwords

# Note that the removal depends on the library and will be part of each. 
stop_words = stopwords.words('spanish')
stop_words_extension = get_stop_words('es')
stop_words.extend(stop_words_extension)

# Remove accents from stopwords (I'll work with no accents anywhere, since we don't expect people to write always with accents) 
stop_words = [gensim.utils.deaccent(each_word) for each_word in stop_words]

stop_words.extend(['interesante', 'universidadjuancarlos', 'él', 'hala', 'juanca', 'reyjuancar', 'urjcritica', 'this', 'was', 'date', 'great', 'my', 'first', 'claro', 'queydondeestudiar2020', 'rey', 'juan', 'literalmente', 'universidadreyjuancarlo', 'universidadjuancarlos', 'lareyjuancarlos', 'universidadreyjuancarlos', 'universidadreyjuancarlo', 'urjc', 'gracia', 'gracias','ajjaj', 'xd', 'xdd', 'xdxd', 'Madrid', 'primero', 'segundo', 'tercero', 'cuarto', 'quinto', 'sexto', 'septimo', 'octavo', 't', 'a', 's', 'k', 'q', 'mas', 'tambien', 'ir', 'alla', 'cosa', 'iee', 'siquiera', 'in', 'the', 'of', 'ano', 'n', 'p', 'r', 'asi', 'coincidir', 'aa', 'decir', '¡', '¿', 'poder', 'podeis', 'estais', 'traves', 'alguno', 'buen', 'nosotrxs', 'bufff', 'buff', 'universitario', 'lol', 'omg', 'wtf', 'idk', 'fyi', 'tbh', 'lmao', 'asap', 'thanks', 'thank', 'thx', 
                'trav', 'bastante', 'muchisimo', 'muchisimos', 'muchisima', 'muchisimas', 'muchisimar', 'monton', 'habia', 'xe', 'jo', 'ops', 'ups', 'yupi', 'poca', 'poco', 'enhorabuena', 'incluso', 'igual', 'ahora', 'despu', 'pese', 'ser', 'cierto', 'haber', 'for', 'par', 'universidad', 'with', 'asimismo', 'carlos', 'universidad', 'iii', 'ser', 'entonces', 'madrid', 'tras', 'jo él', 'hacer', 'espán', 'noticia', 'as', 'periodico', 'diario', 'news', 'si', 'aca', 'mas', 'ademas', 'gracias', 'aqui', 'hola', 'saludos', 'hoy', 'co', 'bien', 'ver', 'dar', 'vosotrxs', 'wow', 'ahora', 'solo', 'todo', 'cualquier', 'cualquiera', 'pues', 'vez', 'mismo', 'evidentemente', 'uee', 'hey', 'hello', 'xvii', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xviii', 'ixx', 'xx', 'xxi', 
                'parte', 'normalmente', 'mil', 'adio', 'millon', '!', 'yosoyurjc', 'haced', 'hacerte', 'mientras', 'menos', 'después', 'despues', 'dias', 'cuyo', 'cuya', 'of', '_', 'aun', 'nunca', 'siempre', 'muchisima', 'dia', 'algun', 'pon', 'sino', 'mejor', 'poca', 'peor', 'ano', 'nuevo', '200', '-', 'uee', 'href', 'uno', 'dos', 'tres', 'espán', 'medal él', 'desmontar él', 'jo él', 'joder', 'char él', 'queydondeestudiar', 'mer él', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve', 'diez', 'veintena', 'doce', 'docena', 'trece', 'veitiuno', 'cien', 'decena', 'x2', '–', 'https', 'http', 'rt', 'm', 'uc', 'alguien', 'cada', 'latest', 'tmb', 'via', 'tal', 'etc', 'etcetera', 'traves', 'ma', 'ahi', 'aqui', 'alli', 'alla', 'so', 'if', 'mmm', 'hmm', 'ja', 'cuidadosamente', 'quizas', 'quiza', 'nada', 'nunca', 'probablemente', 'mismisimo', 'totalmente', 'completamente'])

stop_words.extend(['informacion', 'titulo', 'alumno', 'estudiar', 'profesor', 'primerisimo', 'adema', 'yomequedoencasa'])

# Frequent words that add litle meaning: Hyperparameter in gridsearch
extras = ['seguro', 'realizar', 'titulo', 'clase', 'tener', 'necesitar', 'querer', 'abrir', 'agarrar', 'andar', 'caminar', 'buscar', 'caer', 'conocer', 'saber', 'hablar', 'tener', 'tomar', 'poner', 'dar', 'ir', 'decir', 'estar', 'ser', 'vivir', 'oir', 'poner', 'traer', 'sentir', 'ver', 'mirar', 'llegar', 'llevar', 'entender', 'oler', 'salir', 'comprender', 'rayar', 'tener', 'flipar', 'putear', 'darle', 'saber', 'pasar', 'poner', 'esperar', 'dejar', 'parecer', 'salir', 'seguir', 'creer', 'opinar', 'informacion', 'alumno']
