### A. PREPARAMOS LA BBDD PARA INTRODUCIR EN LOS MODELOS

### LIMPIEZA UTILIZADA

In [8]:
# Abrimos el fichero anterior de la carpeta
import pandas as pd
df = pd.read_csv("../Procesamiento_Datos/full-data.csv") 

In [9]:
df.keys()

Index(['Unnamed: 0', 'text', 'created_at', 'created_at_time',
       'created_at_hour', 'retweeted', 'retweet_count', 'favorite_count',
       'user_ verified', 'user_id', 'user_name', 'user_location',
       'user_notificacion', 'user_followers', 'user_friends',
       'user_withheld_in_countries', 'mentions_in_tweet', 'is_reply',
       'source_device', 'finished_tweet', 'status_count', 'hashtags_text',
       'hastags_indices', 'hastags_in_tweet', 'possitivity_textblob',
       'possitivity_vader', 'possitivity_ibm', 'sentiment_mean',
       'sentiment_norm', 'sentiment', 'categories', 'topic_1', 'topic_2',
       'topic_3'],
      dtype='object')

In [11]:
# Defino las variables dependientes que eliminaremos del df
vars = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'text', 
        'user_id', 'created_at_time', 'user_name', 'hashtags_text', 'hastags_indices',
        'possitivity_textblob', 'possitivity_vader', 'possitivity_ibm', 
        'sentiment_mean', 'sentiment_norm', 'user_withheld_in_countries']
df_limpio = df.drop(vars,axis=1)

In [12]:
df_limpio.shape

(7809, 21)

In [13]:
df_limpio.dtypes

created_at            object
created_at_hour        int64
retweeted             object
retweet_count          int64
favorite_count         int64
user_ verified       float64
user_location         object
user_notificacion       bool
user_followers         int64
user_friends           int64
mentions_in_tweet      int64
is_reply              object
source_device         object
finished_tweet          bool
status_count           int64
hastags_in_tweet       int64
sentiment             object
categories            object
topic_1               object
topic_2               object
topic_3               object
dtype: object

In [14]:
df_limpio.isna().sum()

created_at              0
created_at_hour         0
retweeted               0
retweet_count           0
favorite_count          0
user_ verified       7809
user_location        2686
user_notificacion       0
user_followers          0
user_friends            0
mentions_in_tweet       0
is_reply                0
source_device           0
finished_tweet          0
status_count            0
hastags_in_tweet        0
sentiment               0
categories              0
topic_1                 0
topic_2               831
topic_3              4465
dtype: int64

#### Quitamos categóricas y convertimos en dummies (OneHotEncoder)
Para que los modelos funcionen, tenemos que tener las categorías como dummies, sino pueden dar fallos, por ello realizamos la siguiente transformación. 

Para ello es necesario hacer algo de limpieza en las variables categóricas, recategorizar o agrupar categorías. Estudiamos las clases de las variables categóricas. 

In [15]:
# Vemos el resumen de las categóricas
df_limpio.select_dtypes(['object']).keys()

Index(['created_at', 'retweeted', 'user_location', 'is_reply', 'source_device',
       'sentiment', 'categories', 'topic_1', 'topic_2', 'topic_3'],
      dtype='object')

In [16]:
df_limpio[['created_at']].value_counts() # OK

created_at
Mon Aug 09    5827
Wed Aug 11    1982
dtype: int64

In [17]:
df_limpio[['retweeted']].value_counts() # OK

retweeted
No           6325
Si           1484
dtype: int64

In [18]:
df_limpio[['is_reply']].value_counts()

is_reply       
No                 4181
Uber_Support        237
Uber                101
Uber_Kryptonian      59
joncoopertweets      39
                   ... 
PeakSamu18            1
Pearlofwisdommm       1
Pedram_Navid          1
PedroHmaumau          1
UnicornWolfLAZR       1
Length: 2548, dtype: int64

In [19]:
# Cambio is_reply a una variable binaria (Si reply o No o Uber, porque nos interesa)

cond1 = df_limpio['is_reply'].isin(['No','Uber_Support','Uber'])
df_limpio['is_reply'] = df_limpio['is_reply'].where(cond1,'Si')

In [20]:
df_limpio[['is_reply']].value_counts()

is_reply    
No              4181
Si              3290
Uber_Support     237
Uber             101
dtype: int64

In [21]:
# Uno categorías Uber_support y Uber
cond2 = df_limpio['is_reply'].isin(['Si','No','Uber'])
df_limpio['is_reply'] = df_limpio['is_reply'].where(cond2,'Uber')

In [22]:
df_limpio[['is_reply']].value_counts() # OK

is_reply
No          4181
Si          3290
Uber         338
dtype: int64

In [23]:
df_limpio[['source_device']].value_counts()

source_device                 
Twitter for iPhone                3408
Twitter for Android               1843
Twitter Web App                   1254
Sprinklr                           523
test-shield-bot                    223
                                  ... 
Microsoft Power Platform             1
MarketChameleon.com                  1
Maiyro - Bloggers Social Media       1
Magnifintech                         1
はてな / Hatena                         1
Length: 152, dtype: int64

In [24]:
# Creamos 5 categorías, las 4 más representadas y 'Otros'
# Uno categorías Uber_support y Uber
cond3 = df_limpio['source_device'].isin(['Twitter for iPhone','Twitter for Android',
                                         'Twitter Web App','Sprinklr'])
df_limpio['source_device'] = df_limpio['source_device'].where(cond3,'Otro')

In [25]:
df_limpio[['source_device']].value_counts()

source_device      
Twitter for iPhone     3408
Twitter for Android    1843
Twitter Web App        1254
Otro                    781
Sprinklr                523
dtype: int64

In [26]:
df_limpio[['user_location']].value_counts()[0:20]

user_location             
Stockholm, Sweden             225
London, England               121
United States                  77
Los Angeles, CA                59
India                          54
Chicago, IL                    52
New York, NY                   51
California, USA                50
London                         43
Atlanta, GA                    37
Washington, DC                 35
New York, USA                  31
Lagos, Nigeria                 31
United Kingdom                 27
United Kingdom                 25
Johannesburg, South Africa     25
UK                             24
York, England                  23
New Delhi, India               23
Brooklyn, NY                   22
dtype: int64

In [27]:
# Creamos  categorías representadas con más de 50 repeticiones
# A esta variable habrá que tratarla un poquito mejor en el futuro. 

cond3 = df_limpio['user_location'].isin(['Stockholm, Sweden', 'United States', 
                                         'Los Angeles, CA', 'India', 'Chicago, IL',
                                         'New York, NY','California, USA'])
df_limpio['user_location'] = df_limpio['user_location'].where(cond3,'Otro')

In [28]:
df_limpio[['user_location']].value_counts()

user_location    
Otro                 7241
Stockholm, Sweden     225
United States          77
Los Angeles, CA        59
India                  54
Chicago, IL            52
New York, NY           51
California, USA        50
dtype: int64

In [29]:
df_limpio

Unnamed: 0,created_at,created_at_hour,retweeted,retweet_count,favorite_count,user_ verified,user_location,user_notificacion,user_followers,user_friends,...,is_reply,source_device,finished_tweet,status_count,hastags_in_tweet,sentiment,categories,topic_1,topic_2,topic_3
0,Wed Aug 11,9,Si,1,0,,"Stockholm, Sweden",False,20504,98,...,No,Otro,False,695888,0,Negativo,"[{'score': 0.693098, 'label': '/travel/special...",travel,specialty travel,adventure travel
1,Wed Aug 11,9,No,0,0,,Otro,False,2733,970,...,No,Twitter Web App,False,58398,0,Neutro,"[{'score': 0.59693, 'label': '/sports/baseball...",sports,baseball,
2,Wed Aug 11,9,No,0,0,,Otro,False,353,943,...,Si,Twitter for iPhone,False,5974,0,Neutro,"[{'score': 0.785107, 'label': '/sports/soccer'}]",sports,soccer,
3,Wed Aug 11,9,No,0,0,,Otro,False,6,1,...,Si,Twitter for Android,False,169,0,Positivo,"[{'score': 0.685688, 'label': '/science/mathem...",science,mathematics,arithmetic
4,Wed Aug 11,9,No,0,0,,Otro,False,5562,3,...,No,Otro,False,113083,0,Negativo,"[{'score': 0.778195, 'label': '/news'}, {'scor...",news,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7804,Mon Aug 09,13,No,0,0,,Otro,False,0,4,...,Si,Twitter for iPhone,False,21,0,Neutro,"[{'score': 0.998438, 'label': '/automotive and...",automotive and vehicles,cars,
7805,Mon Aug 09,13,No,0,1,,Otro,False,477,435,...,Si,Twitter for iPhone,False,5828,0,Negativo,"[{'score': 0.780402, 'label': '/sports/soccer'}]",sports,soccer,
7806,Mon Aug 09,13,No,0,2,,Otro,False,3215,4690,...,Si,Otro,False,29551,0,Muy Negativo,"[{'score': 0.841392, 'label': '/automotive and...",automotive and vehicles,electric vehicles,
7807,Mon Aug 09,13,No,0,0,,Otro,False,409,870,...,No,Twitter for iPhone,False,54109,0,Muy Negativo,"[{'score': 0.60174, 'label': '/art and enterta...",art and entertainment,shows and events,


In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Creamos un One Hot Encoder que nos ayude a dummificar las nuevas entradas
# de las variables categóricas que incluyamos

col_transformer = ColumnTransformer([
    ('created_at', OneHotEncoder(drop='first'), ['created_at']),
    ('retweeted', OneHotEncoder(drop=['No']), ['retweeted']), 
    ('is_reply', OneHotEncoder(drop=['No']), ['is_reply']), 
    ('source_device', OneHotEncoder(drop=['Otro']), ['source_device']),
    ('user_location', OneHotEncoder(drop=['Otro']), ['user_location'])
])

### B. RESULTADO DE APLICAR PRIMEROS MODELOS

- LR: 0.424510 (0.017124)
- RFC: 0.414779 (0.018737)
- SVM: 0.416700 (0.017998)