# PREPROCESSING

In [6]:
%matplotlib inline
import pandas
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import numpy as np

# Display settings: show every column, do not wrap wide frames across several
# lines, and print floats with 3 digits of precision.
pandas.options.display.max_columns = None
pandas.options.display.expand_frame_repr = False
pandas.options.display.precision = 3

# Load the tweet dataset; empty fields are read as NaN.
df = pandas.read_csv(
    'data.csv',
    sep=',',
    na_values="",
)
print(df.head())

       Date      Time                                         Tweet_Text  Type Media_Type   Hashtags   Tweet_Id                                          Tweet_Url  twt_favourites_IS_THIS_LIKE_QUESTION_MARK  Retweets  Unnamed: 10  Unnamed: 11
0  16-11-11  15:26:37  Today we express our deepest gratitude to all ...  text      photo  ThankAVet  7.970e+17  https://twitter.com/realDonaldTrump/status/797...                                     127213     41112          NaN          NaN
1  16-11-11  13:33:35  Busy day planned in New York. Will soon be mak...  text        NaN        NaN  7.970e+17  https://twitter.com/realDonaldTrump/status/797...                                     141527     28654          NaN          NaN
2  16-11-11  11:14:20  Love the fact that the small groups of protest...  text        NaN        NaN  7.970e+17  https://twitter.com/realDonaldTrump/status/797...                                     183729     50039          NaN          NaN
3  16-11-11   2:19:44  Just had 

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Text-preprocessing tools shared by the tweet-cleaning code below.
nltk.download('stopwords')  # makes sure the stopword corpus is available locally
english_stopwords = stopwords.words('english')
stop = set(english_stopwords)  # a set, so per-token membership tests are cheap
sno = SnowballStemmer('english')

def cleanhtml(sentence):
    """Replace every HTML-like tag in ``sentence`` with a single space.

    A tag is the shortest run that starts with ``<`` and ends with ``>`` on
    the same line (``.`` does not match newlines). Text outside tags is
    returned unchanged.
    """
    return re.sub('<.*?>', ' ', sentence)

def cleanpunc(sentence):
    """Strip or blank out punctuation in ``sentence``.

    First ``?``, ``!``, ``'``, ``"``, ``#`` and ``|`` are deleted; then
    ``.``, ``,``, ``)``, ``(`` and ``/`` are each replaced by a space.
    """
    # Inside [...] the '|' is a literal character, not alternation, so pipes
    # are deleted by this first pass as well.
    without_marks = re.sub(r'[?|!|\'|"|#]', '', sentence)
    # '\|' is an escaped pipe: a bare backslash is NOT matched by this class.
    return re.sub(r'[.|,|)|(|\|/]', ' ', without_marks)

# Tweet text processing: strip markup and punctuation, discard short or
# non-alphabetic tokens and English stopwords, then stem what is left.
def _clean_tweet(text):
    """Return the space-joined stemmed tokens kept from one raw tweet."""
    kept = []
    for token in cleanhtml(text).split():
        candidate = cleanpunc(token)
        # Tokens must be purely alphabetic and longer than two characters.
        # cleanpunc may insert spaces, which also makes isalpha() fail.
        if not candidate.isalpha() or len(candidate) <= 2:
            continue
        lowered = candidate.lower()
        if lowered in stop:
            continue
        kept.append(sno.stem(lowered))
    return " ".join(kept)


final_string = [_clean_tweet(tweet) for tweet in df['Tweet_Text'].values]

# Attach the cleaned tweets to the DataFrame as a new column.
df['cleaned_tweet'] = final_string

# Preview the first rows of the new column.
print(df["cleaned_tweet"].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aleja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    today express deepest gratitud serv arm thankavet
1    busi day plan new soon make import decis peopl...
2    love fact small group protest last night passi...
3      open success presidenti profession incit unfair
4    fantast day met presid obama first realli good...
Name: cleaned_tweet, dtype: object


In [10]:
# Label each tweet by engagement: 'mucho' when likes + retweets reach
# VIRAL_FACTOR times the sum of the two column medians, otherwise 'poco'.
# NOTE: despite the "media_" prefix (Spanish for "mean"), these are medians.
VIRAL_FACTOR = 2
media_likes = df["twt_favourites_IS_THIS_LIKE_QUESTION_MARK"].median()
media_retweets = df["Retweets"].median()

engagement = df["twt_favourites_IS_THIS_LIKE_QUESTION_MARK"] + df["Retweets"]
viral_threshold = (media_likes + media_retweets) * VIRAL_FACTOR

# Vectorised comparison instead of a row-wise apply: same labels, no Python
# call per row, and it also works on an empty frame. Rows with a missing
# count compare as False and therefore get 'poco', as before.
df['Viral'] = (engagement >= viral_threshold).map({True: 'mucho', False: 'poco'})
print(df["Viral"].value_counts())

Viral
poco     4907
mucho    2468
Name: count, dtype: int64


In [11]:
# Remove every raw / metadata column in one call. A single drop is atomic:
# if any name is missing it raises KeyError before df is modified, instead of
# leaving the frame half-stripped as a sequence of in-place drops would.
COLUMNS_TO_DROP = [
    'Tweet_Text',
    'Tweet_Id',
    'Tweet_Url',
    'Date',
    'Time',
    'Media_Type',
    'Type',
    'Hashtags',
    'twt_favourites_IS_THIS_LIKE_QUESTION_MARK',
    'Retweets',
    'Unnamed: 10',
    'Unnamed: 11',
]
df.drop(columns=COLUMNS_TO_DROP, inplace=True)

# Target labels.
y = df["Viral"].values
# X is every column except 'Viral', kept as a 2-D array because later code
# indexes it by column position.
X = df.drop(columns='Viral').values
print(df[0:5])

                                       cleaned_tweet  Viral
0  today express deepest gratitud serv arm thankavet  mucho
1  busi day plan new soon make import decis peopl...  mucho
2  love fact small group protest last night passi...  mucho
3    open success presidenti profession incit unfair  mucho
4  fantast day met presid obama first realli good...  mucho


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one column per vocabulary term, one row per tweet.
count_vectorizer = CountVectorizer()
# X arrives here as the 2-D array built from df without 'Viral'; its last
# column is used as the text to vectorise.
# NOTE(review): X is rebound to a DataFrame below, so re-running this cell
# without re-running the one that builds X will likely fail on X[:, -1].
X = count_vectorizer.fit_transform( X[:, -1])
feature_names = count_vectorizer.get_feature_names_out()
# Densify the count matrix into a DataFrame whose columns are the terms.
X = pandas.DataFrame(X.toarray(), columns=feature_names)
# Total number of occurrences of each term across all rows.
word_frequencies = X.sum(axis=0)
# Keep only the terms whose total count is at least 5.
filtered_words = word_frequencies[word_frequencies >= 5]
X = X[filtered_words.index]
X.to_csv('X.csv', index=False)
# df is mutated in place before being saved: y.csv contains whatever columns
# remain after removing 'cleaned_tweet' (presumably only 'Viral' — confirm).
df.drop('cleaned_tweet', axis=1, inplace=True)
df.to_csv('y.csv', index=False)
print(X[0:5])
print(y[0:5])

   abc  abl  abolish  absolut  accept  accord  account  across  act  action  actual  ad  addit  address  administr  admit  advertis  advic  advis  afford  afraid  african  afternoon  agent  ago  agre  ahead  aid  air  airlin  alabama  alien  allow  almost  along  alreadi  also  alway  amaz  amend  america  americafirst  american  amnesti  among  amount  anchor  anderson  angri  announc  anoth  answer  anticip  anybodi  anyon  anyth  apolog  appear  appreci  apprentic  approv  april  arena  arent  arizona  arm  around  arriv  articl  asham  ask  asktrump  ass  attack  attempt  attend  attent  audienc  avail  averag  away  awesom  babi  back  bad  balanc  ballot  ban  bank  bar  barack  barbara  base  bash  bay  beat  beaten  beauti  becam  beck  becom  beg  begin  behalf  behind  believ  ben  benefit  benghazi  berni  best  better  beyond  bias  big  bigger  biggest  bigleaguetruth  bill  billion  birthday  black  blame  bless  blew  block  bloomberg  blow  blown  blue  board  bob  bobb