Jeyner Arango 201106

Oscar Méndez 20402

# Laboratorio 9

In [1]:
import numpy as np
import pandas as pd


import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from afinn import Afinn

import plotly.express as px
import dash
from dash.dependencies import Input, Output
import dash_core_components as dcc
import dash_html_components as html

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html


## Procesamiento de Datos

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
english_stop_words = stopwords.words('english')
pd.set_option('display.max_colwidth', 140)
df = pd.read_csv('train.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osjom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\osjom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def clean_tweets(tweet):
    """ 
    removemos handles @user, links https, numeros,
    pasamos a minuscula, removemos numeros, partimos texto,
    eliminamos espacios innecesarios y volvemos a unir texto 

    @return texto limpio
    """
    tweet = tweet.replace("%20", " ").replace("&amp;", "&")
    user_removed = re.sub(r'@[A-Za-z0-9]+','',tweet)
    link_removed = re.sub('https?://[A-Za-z0-9./]+','',user_removed)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet= number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet

In [4]:
def remove_stop_words(tweet):
    removed_stop_words = ' '.join([word for word in tweet.split() if word not in english_stop_words])
    return removed_stop_words

In [5]:
def lemmatize_tweet(tweet):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tweet.split()]
    lemmatized_tweet = ' '.join(lemmatized_words)
    return lemmatized_tweet

In [6]:
df['text_length'] = df['text'].apply(len)
df['text'] = df['text'].apply(clean_tweets).apply(lemmatize_tweet).apply(remove_stop_words)

In [7]:
afinn = Afinn()
df['text_sentiment'] = df['text'].apply(afinn.score)

## Analisis de Datos

In [8]:
df.head()

Unnamed: 0,id,keyword,location,text,target,text_length,text_sentiment
0,1,,,deed reason earthquake may allah forgive u,1,69,1.0
1,4,,,forest fire near la ronge sask canada,1,38,-2.0
2,5,,,resident asked shelter place notified officer evacuation shelter place order expected,1,133,-1.0
3,6,,,people receive wildfire evacuation order california,1,65,-1.0
4,7,,,got sent photo ruby alaska smoke wildfire pours school,1,88,0.0


## Dashboard

In [9]:
app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])

app.layout = html.Div(children=[
    html.H1(children='Tweets', style={'textAlign': 'center', 'fontFamily': 'Poppins', 'color': '#e4eef2'}),
    
    # Filter for the target
    html.Div(children=[
        dcc.Checklist(
            id='target-checklist',
            options=[
                {'label': 'Target 0', 'value': 0},
                {'label': 'Target 1', 'value': 1}
            ],
            value=[0, 1]
        )
    ], style={'textAlign': 'center', 'fontFamily': 'Poppins', 'color': '#e4eef2'}),

    html.Div([
        # Countplot of the country (top 10)
        html.Div(children=[
            dcc.Graph(
                id='country-countplot'
            )
        ], style={'width': '50%', 'display': 'inline-block', 'fontFamily': 'Poppins'}),

        # Distribution plot for the sentiment
        html.Div(children=[
            dcc.Graph(
                id='sentiment-distribution-plot'
            )
        ], style={'width': '50%', 'display': 'inline-block', 'fontFamily': 'Poppins'})
    ]),

    html.Div([
        # Countplot of the keyword (top 10)
        html.Div(children=[
            dcc.Graph(
                id='keyword-countplot'
            )
        ], style={'width': '50%', 'display': 'inline-block', 'fontFamily': 'Poppins'}),

        # Distribution plot of the text length
        html.Div(children=[
            dcc.Graph(
                id='text-length-distribution-plot'
            )
        ], style={'width': '50%', 'display': 'inline-block', 'fontFamily': 'Poppins'})
    ], style={'fontFamily': 'Poppins'})
], style={'fontFamily': 'Poppins', 'backgroundColor': '#111212'})


@app.callback(
    [Output('country-countplot', 'figure'),
     Output('sentiment-distribution-plot', 'figure'),
     Output('keyword-countplot', 'figure'),
     Output('text-length-distribution-plot', 'figure')],
    [Input('target-checklist', 'value')]
)
def update_plots(selected_targets):
    filtered_df = df[df['target'].isin(selected_targets)]

    country_countplot = px.bar(filtered_df['location'].value_counts()[:10], title='Country Count').update_layout(
        xaxis_title="Country",
        yaxis_title="Count",
        template="plotly_dark"
    )

    sentiment_distribution_plot = px.histogram(filtered_df, x="text_sentiment", title="Sentiment Distribution").update_layout(
        xaxis_title="Sentiment",
        yaxis_title="Frequency",
        template="plotly_dark"
    )

    keyword_countplot = px.bar(filtered_df['keyword'].value_counts()[:10], title='Keyword Count').update_layout(
        xaxis_title="Keyword",
        yaxis_title="Count",
        template="plotly_dark"
    )

    text_length_distribution_plot = px.histogram(filtered_df, x="text_length", title="Text Length Distribution").update_layout(
        xaxis_title="Text Length",
        yaxis_title="Frequency",
        template="plotly_dark"
    )

    return country_countplot, sentiment_distribution_plot, keyword_countplot, text_length_distribution_plot


if __name__ == '__main__':
    app.run_server(debug=True)