
##Atividade de preparação de dados - análise exploratória

In [1]:
# @title Importação das Bibliotecas Essenciais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from io import StringIO

pd.set_option('display.max_columns', 50)
sns.set_theme(style="whitegrid")
print("Bibliotecas importadas com sucesso!")

Bibliotecas importadas com sucesso!


In [9]:
# @title Carregando dados

df = pd.read_csv('../data/raw/dados_ux_15.csv')

print("\nContagem de Valores Ausentes Antes do Tratamento:")
print(df.isnull().sum())


Contagem de Valores Ausentes Antes do Tratamento:
user_id                     0
age                         2
gender                      0
device_type                 0
internet_connection_type    0
screens_visited             0
gestures_used               4
app_rating                  0
sentiment_analysis          0
app_load_time               0
crash_report                0
battery_usage               1
memory_usage                2
font_size_preference        1
dark_mode_enabled           0
time_of_day                 0
day_of_week                 2
app_version                 0
dtype: int64


In [10]:
# @title Corrigindo dados ausentes

screens_visited = df['screens_visited'].mode()[0]
df['screens_visited'] = df['screens_visited'].fillna(screens_visited)
print(f"screens_visited imputada com moda: {screens_visited}")

memory_usage = df['memory_usage'].median()
df['memory_usage'] = df['memory_usage'].fillna(memory_usage)
print(f"memory_usage imputado com a mediana: {memory_usage}")

age = df['age'].median()
df['age'] = df['age'].fillna(age)
print(f"age imputado com a mediana: {age}")

font_size_preference = df['font_size_preference'].mode()[0]
df['font_size_preference'] = df['font_size_preference'].fillna(font_size_preference)
print(f"font_size_preference imputada com a moda: {font_size_preference}")

gestures_used = df['gestures_used'].mode()[0]
df['gestures_used'] = df['gestures_used'].fillna(gestures_used)
print(f"gestures_used imputada com a moda: {gestures_used}")

day_of_week = df['day_of_week'].mode()[0]
df['day_of_week'] = df['day_of_week'].fillna(day_of_week)
print(f"day_of_week imputada com a moda: {day_of_week}")

print("\nContagem de Valores Ausentes Após Imputação:")
print(df.isnull().sum())

print("\nDimensões")
display(df.shape)

screens_visited imputada com moda: Home;Cart;Checkout
memory_usage imputado com a mediana: 210.0
age imputado com a mediana: 29.0
font_size_preference imputada com a moda: Medium
gestures_used imputada com a moda: Tap;Swipe
day_of_week imputada com a moda: Segunda-feira

Contagem de Valores Ausentes Após Imputação:
user_id                     0
age                         0
gender                      0
device_type                 0
internet_connection_type    0
screens_visited             0
gestures_used               0
app_rating                  0
sentiment_analysis          0
app_load_time               0
crash_report                0
battery_usage               1
memory_usage                0
font_size_preference        0
dark_mode_enabled           0
time_of_day                 0
day_of_week                 0
app_version                 0
dtype: int64

Dimensões


(15, 18)

In [11]:
# @title Corrigindo inconsistências de Strings

df['device_type'] = df['device_type'].str.lower()
df['device_type'].replace({'Android': 'android'}, inplace=True)
print("\nValores únicos de device_type após padronização:")
print(df['device_type'].value_counts())

df['time_of_day'] = df['time_of_day'].str.lower()
df['time_of_day'].replace({'Manhã': 'manhã'}, inplace=True)
print("\nValores únicos de time_of_day após padronização:")
print(df['time_of_day'].value_counts())

df['sentiment_analysis'] = df['sentiment_analysis'].str.lower()
df['sentiment_analysis'].replace({'Positivo': 'positivo'}, inplace=True)
print("\nValores únicos de sentiment_analysis após padronização:")
print(df['sentiment_analysis'].value_counts())

df['gender'] = df['gender'].replace({'Não-binário': 'Non-Binary'})
print("\nValores únicos de gender após padronização:")
print(df['gender'].value_counts(dropna=False))

display(df)
print(df.isnull().sum())


Valores únicos de device_type após padronização:
device_type
android    8
ios        7
Name: count, dtype: int64

Valores únicos de time_of_day após padronização:
time_of_day
manhã    5
tarde    4
noite    4
manha    2
Name: count, dtype: int64

Valores únicos de sentiment_analysis após padronização:
sentiment_analysis
positivo    9
negativo    4
neutro      2
Name: count, dtype: int64

Valores únicos de gender após padronização:
gender
Male          6
Female        5
Non-Binary    4
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['device_type'].replace({'Android': 'android'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_of_day'].replace({'Manhã': 'manhã'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

Unnamed: 0,user_id,age,gender,device_type,internet_connection_type,screens_visited,gestures_used,app_rating,sentiment_analysis,app_load_time,crash_report,battery_usage,memory_usage,font_size_preference,dark_mode_enabled,time_of_day,day_of_week,app_version
0,1,32.0,Male,android,4G,Home;Profile;Search,Tap;Swipe,4,positivo,3.2,False,18,210.0,Medium,True,tarde,Segunda-feira,2.1.0
1,2,29.0,Female,ios,Wi-Fi,Home;Cart;Checkout,Tap,5,positivo,1.8,False,12,150.0,Small,False,manhã,Quarta-feira,2.2.0
2,3,45.0,Non-Binary,android,5G,Home;Settings;Help,Tap;Pinch-to-zoom,3,neutro,4.5,False,25,340.0,Large,True,noite,Sexta-feira,2.1.5
3,4,29.0,Male,ios,Wi-Fi,Home;Search;Profile,Tap;Swipe,2,negativo,5.1,True,30,210.0,Small,True,manha,Terca-feira,2.0.1
4,5,19.0,Male,android,5G,Home;Search;Cart;Checkout,Swipe;Tap,5,positivo,2.9,False,20,200.0,Medium,False,tarde,Segunda-feira,2.1.0
5,6,50.0,Female,ios,Wi-Fi,Home;Profile;Settings,Tap,3,positivo,4.2,True,high,400.0,Medium,True,manhã,Quinta-feira,2.1.5
6,7,22.0,Non-Binary,android,4G,Home;Help,Tap;Swipe,4,positivo,3.5,False,22,190.0,Small,False,noite,Domingo,2.2.0
7,8,38.0,Male,ios,5G,Home;Search;Cart,Tap;Swipe,6,negativo,6.0,True,40,500.0,Large,True,tarde,Sexta-feira,2.0.1
8,9,27.0,Female,android,Wi-Fi,Home;Profile,Tap;Swipe,4,positivo,3.1,False,10,170.0,Medium,False,manhã,Segunda-feira,2.2.0
9,10,29.0,Non-Binary,android,Wi-Fi,Home;Profile;Help,Tap;Swipe,3,negativo,4.8,True,28,310.0,Medium,True,manha,Segunda-feira,2.1.5


user_id                     0
age                         0
gender                      0
device_type                 0
internet_connection_type    0
screens_visited             0
gestures_used               0
app_rating                  0
sentiment_analysis          0
app_load_time               0
crash_report                0
battery_usage               1
memory_usage                0
font_size_preference        0
dark_mode_enabled           0
time_of_day                 0
day_of_week                 0
app_version                 0
dtype: int64


In [12]:
# @title Corrigindo tipos de Dados inválidos

df.info()

df['battery_usage'] = pd.to_numeric(df['battery_usage'], errors='coerce') 
print("Tipo da coluna battery_usage após conversão:")
print(df.dtypes['battery_usage'])

battery_usage = df['battery_usage'].median()
df['battery_usage'] = df['battery_usage'].fillna(battery_usage)
print(f"battery_usage imputado com a mediana: {battery_usage}")

app_rating = df['app_rating'].median()

df['app_rating'] = df['app_rating'].replace({6: app_rating })
print("\nValores únicos de app_rating após padronização:")
print(df['app_rating'].value_counts(dropna=False))

display(df)
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   user_id                   15 non-null     int64  
 1   age                       15 non-null     float64
 2   gender                    15 non-null     object 
 3   device_type               15 non-null     object 
 4   internet_connection_type  15 non-null     object 
 5   screens_visited           15 non-null     object 
 6   gestures_used             15 non-null     object 
 7   app_rating                15 non-null     int64  
 8   sentiment_analysis        15 non-null     object 
 9   app_load_time             15 non-null     float64
 10  crash_report              15 non-null     bool   
 11  battery_usage             14 non-null     object 
 12  memory_usage              15 non-null     float64
 13  font_size_preference      15 non-null     object 
 14  dark_mode_en

Unnamed: 0,user_id,age,gender,device_type,internet_connection_type,screens_visited,gestures_used,app_rating,sentiment_analysis,app_load_time,crash_report,battery_usage,memory_usage,font_size_preference,dark_mode_enabled,time_of_day,day_of_week,app_version
0,1,32.0,Male,android,4G,Home;Profile;Search,Tap;Swipe,4,positivo,3.2,False,18.0,210.0,Medium,True,tarde,Segunda-feira,2.1.0
1,2,29.0,Female,ios,Wi-Fi,Home;Cart;Checkout,Tap,5,positivo,1.8,False,12.0,150.0,Small,False,manhã,Quarta-feira,2.2.0
2,3,45.0,Non-Binary,android,5G,Home;Settings;Help,Tap;Pinch-to-zoom,3,neutro,4.5,False,25.0,340.0,Large,True,noite,Sexta-feira,2.1.5
3,4,29.0,Male,ios,Wi-Fi,Home;Search;Profile,Tap;Swipe,2,negativo,5.1,True,30.0,210.0,Small,True,manha,Terca-feira,2.0.1
4,5,19.0,Male,android,5G,Home;Search;Cart;Checkout,Swipe;Tap,5,positivo,2.9,False,20.0,200.0,Medium,False,tarde,Segunda-feira,2.1.0
5,6,50.0,Female,ios,Wi-Fi,Home;Profile;Settings,Tap,3,positivo,4.2,True,21.0,400.0,Medium,True,manhã,Quinta-feira,2.1.5
6,7,22.0,Non-Binary,android,4G,Home;Help,Tap;Swipe,4,positivo,3.5,False,22.0,190.0,Small,False,noite,Domingo,2.2.0
7,8,38.0,Male,ios,5G,Home;Search;Cart,Tap;Swipe,4,negativo,6.0,True,40.0,500.0,Large,True,tarde,Sexta-feira,2.0.1
8,9,27.0,Female,android,Wi-Fi,Home;Profile,Tap;Swipe,4,positivo,3.1,False,10.0,170.0,Medium,False,manhã,Segunda-feira,2.2.0
9,10,29.0,Non-Binary,android,Wi-Fi,Home;Profile;Help,Tap;Swipe,3,negativo,4.8,True,28.0,310.0,Medium,True,manha,Segunda-feira,2.1.5


user_id                     0
age                         0
gender                      0
device_type                 0
internet_connection_type    0
screens_visited             0
gestures_used               0
app_rating                  0
sentiment_analysis          0
app_load_time               0
crash_report                0
battery_usage               0
memory_usage                0
font_size_preference        0
dark_mode_enabled           0
time_of_day                 0
day_of_week                 0
app_version                 0
dtype: int64


In [None]:
# @title Criação do novo Arquivo csv e excel
import os

os.makedirs('./data/processed', exist_ok=True)

try:
    csv_path = '../data/processed/ux_dataset_clean.csv'
    df.to_csv(csv_path, index=False)
    print(f"Arquivo CSV criado com sucesso em: {csv_path}")
    
    try:
        excel_path = '../data/processed/ux_dataset_clean.xlsx'
        df.to_excel(excel_path, index=False)
        print(f"Arquivo Excel criado com sucesso em: {excel_path}")
    except ImportError:
        print("Para exportar para Excel, instale openpyxl com: pip install openpyxl")
except Exception as e:
    print(f"Erro ao salvar arquivos: {e}")

Arquivo CSV criado com sucesso em: ../data/processed/ux_dataset_clean.csv
Arquivo Excel criado com sucesso em: ./data/processed/ux_dataset_clean.xlsx
