### Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
import os
import psutil

#### Helper funcs

In [None]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    The RSS value reported by psutil is divided by 1024 * 1024.
    """
    rss = psutil.Process(os.getpid()).memory_info().rss
    return rss / (1024 * 1024)

def get_cpu_usage():
    """Return the CPU utilisation (percent) of the current process.

    The reading is sampled with ``interval=1``, so the call blocks for
    about one second before returning.
    """
    current_pid = os.getpid()
    return psutil.Process(current_pid).cpu_percent(interval=1)

#### Metrics

In [3]:
# Wall-clock reference taken before any resource reading.
start = time.time()

# Baseline readings captured before the datasets are loaded.
start_mem = get_memory_usage()
# Module-level psutil.cpu_percent: sampled over a blocking 1-second window.
start_cpu = psutil.cpu_percent(interval=1)
print(f"Start mem: {start_mem:.2f} MB")
# CPU utilisation is a percentage, so it is labelled "%" rather than "MB".
print(f"Start cpu: {start_cpu:.2f}%")

Start mem: 162.78 MB
Start cpu: 12.40 MB


### Load Datasets

In [4]:
# Restart the wall-clock timer: the execution time reported at the end of the
# notebook is measured from this assignment of `start`.
start = time.time()

# Raw input tables, read from the local "datasets" directory.
crime = pd.read_csv("datasets/state_crime.csv")
minimum_wage = pd.read_csv(
    "datasets/min_wage.csv",
    encoding="cp1252",  # this file is read as Windows-1252, not the default UTF-8
)
wages_by_education = pd.read_csv("datasets/wages_by_education.csv")

### Tratamento

##### Valores Nulos

In [5]:
# Number of missing entries in each column of the crime dataset.
crime_null_counts = crime.isna().sum()
print("Valores nulos dataset crime: ", crime_null_counts)

Valores nulos dataset crime:  State                            0
Year                             0
Data.Population                  0
Data.Rates.Property.All          0
Data.Rates.Property.Burglary     0
Data.Rates.Property.Larceny      0
Data.Rates.Property.Motor        0
Data.Rates.Violent.All           0
Data.Rates.Violent.Assault       0
Data.Rates.Violent.Murder        0
Data.Rates.Violent.Rape          0
Data.Rates.Violent.Robbery       0
Data.Totals.Property.All         0
Data.Totals.Property.Burglary    0
Data.Totals.Property.Larceny     0
Data.Totals.Property.Motor       0
Data.Totals.Violent.All          0
Data.Totals.Violent.Assault      0
Data.Totals.Violent.Murder       0
Data.Totals.Violent.Rape         0
Data.Totals.Violent.Robbery      0
dtype: int64


In [6]:
# Report missing values per column before discarding the unused columns.
print("Valores nulos dataset minimum wage: ", minimum_wage.isna().sum())
# `minimum_wage` is reassigned from itself, so on a second run of this cell the
# two columns are already gone; errors="ignore" keeps the cell re-runnable
# instead of raising KeyError.
minimum_wage = minimum_wage.drop(
    columns=["Footnote", "Department.Of.Labor.Uncleaned.Data"],
    errors="ignore",
)

Valores nulos dataset minimum wage:  Year                                                      0
State                                                     0
State.Minimum.Wage                                        0
State.Minimum.Wage.2020.Dollars                           0
Federal.Minimum.Wage                                      0
Federal.Minimum.Wage.2020.Dollars                         0
Effective.Minimum.Wage                                    0
Effective.Minimum.Wage.2020.Dollars                       0
CPI.Average                                               0
Department.Of.Labor.Uncleaned.Data                        0
Department.Of.Labor.Cleaned.Low.Value                     0
Department.Of.Labor.Cleaned.Low.Value.2020.Dollars       15
Department.Of.Labor.Cleaned.High.Value                    0
Department.Of.Labor.Cleaned.High.Value.2020.Dollars      15
Footnote                                               2406
dtype: int64


In [7]:
# Number of missing entries in each column of the wages-by-education dataset.
education_null_counts = wages_by_education.isna().sum()
print("Valores nulos dataset education: ", education_null_counts)

Valores nulos dataset education:  year                               0
less_than_hs                       0
high_school                        0
some_college                       0
bachelors_degree                   0
                                  ..
hispanic_women_less_than_hs        0
hispanic_women_high_school         0
hispanic_women_some_college        0
hispanic_women_bachelors_degree    0
hispanic_women_advanced_degree     0
Length: 61, dtype: int64


Valores nulos aparecem apenas no dataset minimum_wage, nas colunas:

- Department.Of.Labor.Cleaned.Low.Value.2020.Dollars
- Department.Of.Labor.Cleaned.High.Value.2020.Dollars
- Footnote

No entanto, nesse dataset, valores 0 também são valores nulos



In [8]:
# Diagnostic copy in which every 0 is treated as a missing value; the original
# `minimum_wage` frame is not modified by this cell.
true_null = minimum_wage.replace(to_replace=0, value=np.nan)
true_null_counts = true_null.isna().sum()
print("Valores nulos dataset minimum wage: ", true_null_counts)

Valores nulos dataset minimum wage:  Year                                                     0
State                                                    0
State.Minimum.Wage                                     430
State.Minimum.Wage.2020.Dollars                        430
Federal.Minimum.Wage                                     0
Federal.Minimum.Wage.2020.Dollars                        0
Effective.Minimum.Wage                                   0
Effective.Minimum.Wage.2020.Dollars                      0
CPI.Average                                              0
Department.Of.Labor.Cleaned.Low.Value                  430
Department.Of.Labor.Cleaned.Low.Value.2020.Dollars     430
Department.Of.Labor.Cleaned.High.Value                 430
Department.Of.Labor.Cleaned.High.Value.2020.Dollars    430
dtype: int64


#### Substituir valores nulos - Interpolação

In [9]:
# Columns in which a stored 0 means "no value" (see the zero-as-null count above)
# and therefore has to be imputed together with the real NaNs.
for col in [
    'State.Minimum.Wage',
    'State.Minimum.Wage.2020.Dollars',
    'Department.Of.Labor.Cleaned.Low.Value',
    'Department.Of.Labor.Cleaned.Low.Value.2020.Dollars',
    'Department.Of.Labor.Cleaned.High.Value',
    'Department.Of.Labor.Cleaned.High.Value.2020.Dollars',
]:
    # https://www.geeksforgeeks.org/pandas-dataframe-interpolate/
    # interpolate() only fills NaN, so the zeros must be converted to NaN on
    # `minimum_wage` itself first; otherwise they survive the imputation.
    # limit_direction='both' also fills gaps at the start and end of a column.
    # NOTE(review): the interpolation follows row order over the whole frame and
    # is not grouped by State — confirm that neighbouring rows are comparable.
    minimum_wage[col] = (
        minimum_wage[col]
        .replace(0, np.nan)
        .interpolate(method='linear', limit_direction='both')
    )


In [10]:
# Re-check the missing-value counts once the interpolation cell has run.
minimum_wage_null_counts = minimum_wage.isna().sum()
print("Valores nulos dataset minimum wage: ", minimum_wage_null_counts)


Valores nulos dataset minimum wage:  Year                                                   0
State                                                  0
State.Minimum.Wage                                     0
State.Minimum.Wage.2020.Dollars                        0
Federal.Minimum.Wage                                   0
Federal.Minimum.Wage.2020.Dollars                      0
Effective.Minimum.Wage                                 0
Effective.Minimum.Wage.2020.Dollars                    0
CPI.Average                                            0
Department.Of.Labor.Cleaned.Low.Value                  0
Department.Of.Labor.Cleaned.Low.Value.2020.Dollars     0
Department.Of.Labor.Cleaned.High.Value                 0
Department.Of.Labor.Cleaned.High.Value.2020.Dollars    0
dtype: int64


#### Normalizar

In [11]:
# Shared scaler: it is re-fitted on every call to normalize_numerical_columns,
# so after a call it holds the fit of the most recently normalised frame.
scaler = MinMaxScaler(feature_range=(0, 1))


def normalize_numerical_columns(df, exclude=("year",)):
    """Return a copy of ``df`` with its numeric columns min-max scaled to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is not modified.
    exclude : iterable of str, optional
        Column names left unscaled. Names are compared case-insensitively, so
        the default covers both ``'Year'`` and ``'year'``; the datasets in this
        notebook use both spellings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose remaining numeric columns were replaced by the
        output of the module-level ``scaler``. Non-numeric and excluded
        columns are returned unchanged.
    """
    excluded = {str(name).lower() for name in exclude}
    numerical_columns = [
        column
        for column in df.select_dtypes(include=['number']).columns
        if str(column).lower() not in excluded
    ]

    df_scaled = df.copy()
    # Fitting needs at least one column and one row; otherwise the copy is
    # returned as is instead of letting the scaler raise.
    if numerical_columns and not df.empty:
        df_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])

    return df_scaled

# Scale the three datasets one after another, in this order; each call returns
# a new frame and leaves its input untouched.
minimum_wage_scaled, crime_scaled, wages_by_education_scaled = map(
    normalize_numerical_columns,
    (minimum_wage, crime, wages_by_education),
)

In [12]:
# NOTE(review): this inspects the un-scaled `minimum_wage` frame; checking
# `minimum_wage_scaled` may have been intended here — confirm.
minimum_wage_null_counts = minimum_wage.isna().sum()
print("Valores nulos dataset minimum wage: ", minimum_wage_null_counts)


Valores nulos dataset minimum wage:  Year                                                   0
State                                                  0
State.Minimum.Wage                                     0
State.Minimum.Wage.2020.Dollars                        0
Federal.Minimum.Wage                                   0
Federal.Minimum.Wage.2020.Dollars                      0
Effective.Minimum.Wage                                 0
Effective.Minimum.Wage.2020.Dollars                    0
CPI.Average                                            0
Department.Of.Labor.Cleaned.Low.Value                  0
Department.Of.Labor.Cleaned.Low.Value.2020.Dollars     0
Department.Of.Labor.Cleaned.High.Value                 0
Department.Of.Labor.Cleaned.High.Value.2020.Dollars    0
dtype: int64


#### Merge

In [13]:
# Step 1: join crime and minimum wage on (Year, State). The outer join keeps
# unmatched rows from both sides, and dropna() then removes every row that
# contains a missing value.
merged_data = pd.merge(
    crime,
    minimum_wage,
    how="outer",
    on=["Year", "State"],
).dropna()
# to_csv returns None when a path is given, so this name only marks the export.
merged_data_csv = merged_data.to_csv("datasets/merged_data.csv", index=False)

# Step 2: also join wages_by_education, which is keyed by a lower-case "year"
# column, so "Year" is renamed before merging.
# NOTE(review): the raw frames are merged here, not the *_scaled ones — confirm.
merged_data = pd.merge(
    merged_data.rename(columns={'Year': 'year'}),
    wages_by_education,
    how="outer",
    on=["year"],
).dropna()
merged_data_csv_2 = merged_data.to_csv("datasets/merged_data_2.csv", index=False)

# Wall-clock time elapsed since the most recent assignment of `start`.
end = time.time()
final = end - start

# Memory reading for this process after the processing, from get_memory_usage().
end_mem = get_memory_usage()
# Module-level psutil.cpu_percent, sampled over a blocking 1-second window.
end_cpu = psutil.cpu_percent(interval=1)
# Memory growth relative to the baseline stored in `start_mem`.
memory_total = end_mem - start_mem
print(f"Execution time: {final} seconds")
print(f"Final memory: {end_mem:.2f}")
print(f"Final cpu: {end_cpu:.2f}")
print(f"Total memory: {memory_total:.2f} MB")
# NOTE(review): get_cpu_usage() builds a new Process object and samples it for
# 1 s after the work above has finished, so it presumably reports an idle
# process rather than the load of the pipeline — confirm.
print(f"CPU: {get_cpu_usage():.2f}%")


Execution time: 0.2686188220977783 seconds
Final memory: 173.68
Final cpu: 7.00
Total memory: 10.91 MB
CPU: 0.00%


In [14]:
# Final check: missing entries per column of the fully merged dataset.
merged_null_counts = merged_data.isna().sum()
print("Valores nulos datasets: ", merged_null_counts)


Valores nulos datasets:  State                              0
year                               0
Data.Population                    0
Data.Rates.Property.All            0
Data.Rates.Property.Burglary       0
                                  ..
hispanic_women_less_than_hs        0
hispanic_women_high_school         0
hispanic_women_some_college        0
hispanic_women_bachelors_degree    0
hispanic_women_advanced_degree     0
Length: 92, dtype: int64
