In [None]:
# Dates
# pandas represents dates with the Timestamp data type. pd.to_datetime() converts
# strings to Timestamp objects and accepts many input formats. Passing an explicit
# `format` can speed up the conversion because nothing has to be inferred.

# Overwrite three rows with month/day/year strings before parsing.
# NOTE(review): presumably these rows held dates in a different layout that would
# not match the format below -- confirm against the raw data.
earthquakes.loc[3378, "Date"] = "02/23/1975"
earthquakes.loc[7512, "Date"] = "04/28/1985"
earthquakes.loc[20650, "Date"] = "03/13/2011"
earthquakes['date_parsed'] = pd.to_datetime(earthquakes["Date"], format="%m/%d/%Y")

# select the day of the month
day_of_month_earthquakes = earthquakes['date_parsed'].dt.day

# Plot the day of the month, one bin per possible day.
# histplot replaces the deprecated distplot(kde=False): it draws a count
# histogram with no density curve by default, and matches the histplot calls
# used elsewhere in this notebook.
sns.histplot(day_of_month_earthquakes, bins=31)

In [None]:
# Missing values

# number of missing values in each column
missing_values_count = df_data.isnull().sum()

# how many total missing values do we have?
# (rows * columns of the same frame the missing values were counted on)
total_cells = np.prod(df_data.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

# remove all the rows that contain a missing value
# (dropna returns a new frame, so the result has to be kept in a name)
rows_with_na_dropped = df_data.dropna()

# remove all columns with at least one missing value
columns_with_na_dropped = df_data.dropna(axis=1)

# replace all NA's with 0
filled_with_zero = df_data.fillna(0)

# replace each NA with the value that comes directly after it in the same column,
# then replace all the remaining NA's with 0.
# Left as the last expression so the notebook displays the resulting frame.
df_data.bfill(axis=0).fillna(0)

In [None]:
# Scaling and Normalization

#imports
# for Box-Cox Transformation
from scipy import stats

# for min_max scaling
from mlxtend.preprocessing import minmax_scaling

def _compare_distributions(before, after, after_title):
    """Draw `before` and `after` as side-by-side histograms with KDE curves.

    The left panel is always titled "Original Data"; the right panel uses
    `after_title`. Returns the (figure, axes) pair created for the plot.
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 3))
    sns.histplot(before, ax=ax[0], kde=True, legend=False)
    ax[0].set_title("Original Data")
    sns.histplot(after, ax=ax[1], kde=True, legend=False)
    ax[1].set_title(after_title)
    plt.show()
    return fig, ax


# min-max scale column 0 of the original data, then compare with the original
scaled_data = minmax_scaling(original_data, columns=[0])
fig, ax = _compare_distributions(original_data, scaled_data, "Scaled data")

# Box-Cox transform the original data, then compare with the original.
# Element 0 of boxcox's result is what gets plotted as the normalized data.
normalized_data = stats.boxcox(original_data)
fig, ax = _compare_distributions(original_data, normalized_data[0], "Normalized data")


Scaling
![Scaling example](Scalling.jpeg)


Normalization
![Normalization example](Normalization.jpeg)

In [None]:
# Character Encodings

# modules we'll use
import pandas as pd
import numpy as np

# helpful character encoding module
import charset_normalizer

In [None]:
# a raw byte string, and proof that it is bytes rather than text
sample_entry = b'\xa7A\xa6n'
print(sample_entry)
print('data type:', type(sample_entry))

# re-encode from "big5-tw" to "utf-8":
# bytes -> str using the source encoding, then str -> bytes as UTF-8
before = str(sample_entry, "big5-tw")
new_entry = before.encode("utf-8")


# Detect the encoding of a file with encoding problems.
# "..." is a placeholder for the real path. The file is opened in binary mode
# so the detector receives raw, undecoded bytes; only the first 10000 bytes
# are sampled.
with open("...", "rb") as rawdata:
    result = charset_normalizer.detect(rawdata.read(10000))
print(result)

# read the file using the detected encoding
# NOTE(review): result["encoding"] may be None when detection fails, in which
# case pandas falls back to its default -- confirm that is acceptable.
files = pd.read_csv("...", encoding=result["encoding"])


# Save in CSV format (pandas writes UTF-8 unless told otherwise)
files.to_csv("my_file.csv")


b'\xa7A\xa6n'
data type: <class 'bytes'>
你好


In [None]:
# Inconsistent Data Entry

import fuzzywuzzy
from fuzzywuzzy import process
import charset_normalizer



array([' Germany', ' New Zealand', ' Sweden', ' USA', 'Australia',
       'Austria', 'Canada', 'China', 'Finland', 'France', 'Greece',
       'HongKong', 'Ireland', 'Italy', 'Japan', 'Macau', 'Malaysia',
       'Mauritius', 'Netherland', 'New Zealand', 'Norway', 'Pakistan',
       'Portugal', 'Russian Federation', 'Saudi Arabia', 'Scotland',
       'Singapore', 'South Korea', 'SouthKorea', 'Spain', 'Sweden',
       'Thailand', 'Turkey', 'UK', 'USA', 'USofA', 'Urbana', 'germany'],
      dtype=object)

# ' Germany' and 'germany' should be one value, and so should
# ' New Zealand' and 'New Zealand'.

# Standardize the names: strip surrounding whitespace, then lowercase, so that
# variants of the same name collapse into a single value.
array = array.str.strip().str.lower()

# Fuzzy matching automatically finds text strings that are very similar to a
# target string. Roughly, two strings are "closer" the fewer characters you
# would have to change to turn one into the other.

# 'South Korea' and 'SouthKorea' should be the same country:
# get the top 10 closest matches to "south korea"
matches = fuzzywuzzy.process.extract(
    "south korea",
    countries,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
)

# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Replace near-duplicates of `string_to_match` in `df[column]`, in place.

    The unique values of the column are scored against `string_to_match` with
    fuzzywuzzy's token_sort_ratio scorer. Among the 10 best-scoring values,
    every one whose score is at least `min_ratio` is overwritten with
    `string_to_match` in all rows where it occurs.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    column : label of the column to clean.
    string_to_match : the canonical string to match against and write back.
    min_ratio : minimum score (inclusive) for a value to be replaced.

    Returns
    -------
    None. Prints "All done!" when finished.
    """
    # get a list of unique strings
    strings = df[column].unique()

    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only the matched strings whose score is >= min_ratio
    # (each match is indexed as [0] = string, [1] = score)
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input string
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")


# Call the function at top level. Inside the function body this call would
# recurse without bound.
replace_matches_in_column(df=professors, column='Country', string_to_match="south korea")

    array(['australia', 'austria', 'canada', 'china', 'finland', 'france',
       'germany', 'greece', 'hongkong', 'ireland', 'italy', 'japan',
       'macau', 'malaysia', 'mauritius', 'netherland', 'new zealand',
       'norway', 'pakistan', 'portugal', 'russian federation',
       'saudi arabia', 'scotland', 'singapore', 'south korea', 'spain',
       'sweden', 'thailand', 'turkey', 'uk', 'urbana', 'usa', 'usofa'],
      dtype=object)
