- 3.1 数据摘要和可视化
    - 数据摘要
        - 标称属性，给出每个可能取值的频数
        - 数值属性，给出5数概括及缺失值的个数
    - 数据可视化
        - 使用直方图、盒图等检查数据分布及离群点
- 3.2 数据缺失的处理
    - 观察数据集中缺失数据，分析其缺失的原因。分别使用下列四种策略对缺失值进行处理:
        - 将缺失部分剔除
        - 用最高频率值来填补缺失值
        - 通过属性的相关关系来填补缺失值
        - 通过数据对象之间的相似性来填补缺失值
    - 注意：处理完成后，要对比新旧数据集的差异。

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from impyute.imputation.cs import fast_knn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor


pd.set_option('display.max_rows', None)  # 设置行不限制数量
pd.set_option('display.max_columns', None)  # 设置列不限制数量
pd.set_option('max_colwidth', 100)  # 设置value的显示长度为100，默认为50

plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False

In [None]:
'''
数据摘要
    标称属性，给出每个可能取值的频数
    数值属性，给出5数概括及缺失值的个数
'''


def nominal_summary(df: pd.DataFrame, nominal_index, new_df=None, n=50):
    """Plot the value frequencies of each nominal attribute as a bar chart.

    Args:
        df: Original data set.
        nominal_index: Iterable of column names to summarise.
        new_df: Optional processed data set. When given, each figure shows
            the original frequencies on the left and the processed ones on
            the right.
        n: Maximum number of most frequent values drawn per chart.
    """

    def draw_bars(counts, caption, **bar_kwargs):
        # Draw the n most frequent values on the currently active axes.
        plt.title(caption, fontsize=30)
        plt.bar(counts.index[:n], counts.values[:n], width=0.8, **bar_kwargs)
        plt.xticks(rotation=90)

    for key in nominal_index:
        counts = df[key].value_counts()
        # Title/label with the column key itself: on pandas >= 2.0 the Series
        # returned by value_counts() is named "count", not after the column.
        caption = str(key)
        if new_df is None:
            plt.figure(figsize=(11, 6))
            draw_bars(counts, caption)
        else:
            plt.figure(figsize=(22, 6))
            plt.subplot(121)
            draw_bars(counts, caption, label=caption)
            plt.subplot(122)
            draw_bars(new_df[key].value_counts(), caption, label=f'processed {caption}')
            plt.legend()
        plt.show()


def numerical_summary(df: pd.DataFrame, numerical_index, new_df=None):
    """Print the five-number summary and missing-value count per numeric attribute.

    Args:
        df: Original data set.
        numerical_index: Iterable of numeric column names to summarise.
        new_df: Optional processed data set; when given, a second line with
            the same statistics for the processed column is printed.
    """
    raw_template = "          Data: Min: {:< 2.4f}\tQ1(25%): {:< 2.4f}\tQ2(50%): {:< 2.4f}\tQ3(75%): {:< 2.4f}\tMax: {:< 2.4f}\tMissing: {:d}"
    processed_template = "Processed Data: Min: {:< 2.4f}\tQ1(25%): {:< 2.4f}\tQ2(50%): {:< 2.4f}\tQ3(75%): {:< 2.4f}\tMax: {:< 2.4f}\tMissing: {:d}"

    def five_number_line(template, stats, row_total):
        # Missing count = total number of rows minus the non-null count
        # reported by describe().
        return template.format(
            stats['min'], stats['25%'], stats['50%'], stats['75%'], stats['max'],
            int(row_total - stats['count']),
        )

    for key in numerical_index:
        column = df[key]
        stats = column.describe()
        print('descriptive statistics ({}):'.format(column.name))
        print(five_number_line(raw_template, stats, len(df)))
        if new_df is None:
            continue
        processed_stats = new_df[key].describe()
        print(five_number_line(processed_template, processed_stats, len(new_df)))

In [None]:
"""
数据可视化
    使用直方图、盒图等检查数据分布及离群点
"""


def box_plot(df, label=None, new_df=None):
    """Draw a box plot of column ``label``.

    With only ``df`` a single plot titled 'Data' is drawn. When ``new_df`` is
    given, the original and the processed column are drawn side by side.
    """
    if new_df is not None:
        plt.figure(figsize=(14, 8))
        panels = (
            (121, 'Data', df, 'Set1'),
            (122, 'Processed Data', new_df, 'Set2'),
        )
        for position, caption, frame, colours in panels:
            plt.subplot(position)
            plt.title(caption)
            sns.boxplot(y=label, data=frame, palette=colours)
        return

    plt.figure()
    plt.title('Data')
    sns.boxplot(y=label, data=df, palette='Set2')


def hist_plot(df, label=None, bins=10, new_df=None):
    """Draw a histogram of the non-null values of column ``label``.

    With only ``df`` a single histogram titled 'Data' is drawn. When
    ``new_df`` is given, the original and the processed column are drawn
    side by side using the same number of bins.
    """
    if new_df is not None:
        plt.figure(figsize=(14, 8))
        panels = (
            (121, 'Data', df),
            (122, 'Processed Data', new_df),
        )
        for position, caption, frame in panels:
            plt.subplot(position)
            plt.title(caption)
            sns.histplot(frame[label].dropna(), bins=bins, kde=False)
        return

    plt.figure()
    plt.title('Data')
    sns.histplot(df[label].dropna(), bins=bins, kde=False)


def viz_pairs(df, labels=None, bins=20, new_df=None):
    """Draw a box plot and a histogram for every requested column.

    Args:
        df: Original data set.
        labels: Iterable of column names to plot. When omitted, every numeric
            column of ``df`` is plotted (the previous default of ``None``
            raised ``TypeError`` on iteration).
        bins: Number of histogram bins.
        new_df: Optional processed data set, forwarded to both plot helpers
            for a side-by-side comparison.
    """
    if labels is None:
        labels = df.select_dtypes(include='number').columns
    for label in labels:
        box_plot(df, label=label, new_df=new_df)
        hist_plot(df, label=label, bins=bins, new_df=new_df)

In [None]:
"""
数据缺失值处理
    将缺失部分剔除
    用最高频率值来填补缺失值
    通过属性的相关关系来填补缺失值
    通过数据对象之间的相似性来填补缺失值
"""


def remove_missing_parts(df: pd.DataFrame):
    """Return a new frame without any row that contains a missing value."""
    complete_rows_only = df.dropna(axis='index', how='any')
    return complete_rows_only


def fill_in_missing_values_with_the_highest_frequency_value(df: pd.DataFrame):
    """Fill every missing value with the most frequent value of its column.

    The input frame is not modified. A column with no non-missing value at
    all has no mode and is left untouched.

    Args:
        df: Data set that may contain missing values.

    Returns:
        A deep copy of ``df`` with the missing values filled.
    """
    cpdf = df.copy(deep=True)
    for key in cpdf.columns:
        modes = cpdf[key].mode()
        if modes.empty:
            # Entirely-missing column: there is nothing to fill with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column; the chained in-place form does not update the
        # frame when pandas Copy-on-Write is active.
        cpdf[key] = cpdf[key].fillna(modes.iloc[0])
    return cpdf


def fill_in_missing_values_through_attribute_correlation(df: pd.DataFrame, miss_index, complete_index):
    """Fill missing values by predicting each incomplete column from complete ones.

    For every column in ``miss_index`` a random-forest regressor is trained on
    the rows where that column is known, using ordinal-encoded feature columns,
    and its predictions replace the missing entries. Once a column has been
    completed it is added to the features used for the following columns.

    Neither ``df`` nor the ``complete_index`` list passed by the caller is
    modified.

    Args:
        df: Data set that may contain missing values.
        miss_index: Column names that contain missing values, in the order in
            which they should be imputed.
        complete_index: Column names without missing values, used as features.

    Returns:
        A deep copy of ``df`` with the imputed values written in.
    """

    def set_miss_values(frame: pd.DataFrame, target, feature_cols):
        # Impute frame[target] in place from feature_cols and return frame.
        missing_mask = frame[target].isnull()
        # Nothing to predict, or no known value to learn from: leave as is.
        if not missing_mask.any() or missing_mask.all():
            return frame

        # Target first, features after it, so column 0 is the label below.
        train_df = frame[[target] + feature_cols]
        known_values = np.array(train_df[~missing_mask])
        unknown_values = np.array(train_df[missing_mask])

        enc_label = OrdinalEncoder()
        y = enc_label.fit_transform(known_values[:, 0].reshape(-1, 1))

        X = known_values[:, 1:]
        test_X = unknown_values[:, 1:]
        # Fit the feature encoder on all rows so categories that only occur
        # in the rows to be predicted are known at transform time.
        enc_feature = OrdinalEncoder()
        enc_feature.fit(np.vstack((X, test_X)))

        rfr = RandomForestRegressor(random_state=0, n_estimators=20, n_jobs=10)
        rfr.fit(enc_feature.transform(X), y.ravel())

        # The regressor returns fractional codes; round to the nearest code
        # so decoding does not depend on how the encoder casts them.
        predicted_codes = np.rint(rfr.predict(enc_feature.transform(test_X)))
        predicted_values = enc_label.inverse_transform(predicted_codes.reshape(-1, 1))

        # Flatten the (n, 1) result before writing it into a single column.
        frame.loc[missing_mask, target] = predicted_values.ravel()
        return frame

    cpdf = df.copy(deep=True)
    # Work on a private copy: the caller's list must not grow between calls.
    feature_cols = list(complete_index)
    for target in miss_index:
        cpdf = set_miss_values(cpdf, target, feature_cols)
        if not cpdf[target].isnull().any():
            # A completed column becomes a feature for the remaining targets.
            feature_cols.insert(0, target)
    return cpdf


def fill_in_missing_values_through_similarity_between_data_objects(df: pd.DataFrame, numerical_index, k=30):
    """Fill missing numeric values with ``fast_knn`` nearest-neighbour imputation.

    Only the columns in ``numerical_index`` are passed to the imputer and
    replaced; all other columns are copied unchanged. The input frame is not
    modified.

    Args:
        df: Data set that may contain missing values.
        numerical_index: Names of the numeric columns to impute.
        k: Number of neighbours forwarded to ``fast_knn``.

    Returns:
        A deep copy of ``df`` with the imputed numeric columns.
    """
    cpdf = df.copy(deep=True)
    columns = list(numerical_index)
    # NOTE(review): assumes fast_knn returns an array with the same shape and
    # row order as its input — confirm against the impyute documentation.
    imputed_values = fast_knn(cpdf[columns].values, k=k)
    # Reuse the frame's own index: column assignment aligns on index labels,
    # so a fresh RangeIndex would produce NaN for any non-default index
    # (for example after rows were dropped).
    cpdf[columns] = pd.DataFrame(data=imputed_values, columns=columns, index=cpdf.index)
    return cpdf


# [Alzheimer Disease and Healthy Aging Data in US](https://www.kaggle.com/datasets/ananthu19/alzheimer-disease-and-healthy-aging-data-in-us)

- The Alzheimer's Disease and Healthy Aging Data in the US dataset is a comprehensive collection of data on the health and well-being of older Americans. The dataset includes information on a wide range of variables, such as demographic characteristics, health conditions, healthcare utilization, and health behaviors.

- The dataset was compiled by the Centers for Disease Control and Prevention (CDC) and includes data from several national surveys, including the Behavioral Risk Factor Surveillance System (BRFSS), the National Health and Nutrition Examination Survey (NHANES), and the National Health Interview Survey (NHIS).

- The primary focus of the dataset is on Alzheimer's disease and related dementias, including prevalence, incidence, risk factors, and outcomes. The data can be used to identify trends and patterns in the prevalence and incidence of these conditions, as well as to explore potential risk factors and interventions that may help to prevent or mitigate their impact.

In [None]:
path = 'Alzheimer Disease and Healthy Aging Data In US.csv'
data = pd.read_csv(path, sep=',')
# Sample_Size is entirely empty, so the column is dropped outright.
data = data.drop(columns=["Sample_Size"])


def _limit_to_float(raw):
    """Map the '.' placeholder to None and cast every other entry to float."""
    if raw == '.':
        return None
    return float(raw)


# Clean the two confidence-limit columns, where '.' stands in for a missing value.
for _limit_col in ('Low_Confidence_Limit', 'High_Confidence_Limit'):
    data[_limit_col] = data[_limit_col].apply(_limit_to_float)
data.info()

## 数据摘要和可视化

In [None]:
# Nominal (categorical) attributes
nominal_index = [
    'LocationAbbr',
    'LocationDesc',
    'Datasource',
    'Class',
    'Topic',
    'Question',
    'Data_Value_Unit',
    'DataValueTypeID',
    'Data_Value_Type',
    'StratificationCategory1',
    'Stratification1',
    'StratificationCategory2',
    'Stratification2',
    'Geolocation',
    'ClassID',
    'TopicID',
    'QuestionID',
    'StratificationCategoryID1',
    'StratificationID1',
    'StratificationCategoryID2',
    'StratificationID2',
]
# Numerical attributes
numerical_index = [
    'YearStart',
    'YearEnd',
    'Data_Value',
    'Data_Value_Alt',
    'Low_Confidence_Limit',
    'High_Confidence_Limit',
    'LocationID',
]

### 标称属性

In [None]:
nominal_summary(data, nominal_index=nominal_index)

### 数值属性

- Data_Value, Data_Value_Alt, Low_Confidence_Limit, High_Confidence_Limit属性都有很多的缺失
- LocationID虽然没有缺失的数据点，但是明显存在离群点

In [None]:
numerical_summary(data, numerical_index=numerical_index)

In [None]:
# 数据可视化：使用直方图、盒图等检查数据分布及离群点
viz_pairs(data, labels=numerical_index, bins=20, new_df=None)

## 数据缺失的处理

### 将缺失部分剔除

- 凡是带有缺失属性的数据一律删除
- 虽然数据量大幅缩小，但是确实去除了一些噪点，比如LocationID中的离群点消失了

In [None]:
new_data = remove_missing_parts(data)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 用最高频率值来填补缺失值

- 简单粗暴，将所有缺失值用其所在属性中出现次数最多的值进行填充，可以补全所有缺失数据
- 在某些情况下不符合实际情况，可能会改变原有的数据分布，并且离群点不会消失

In [None]:
new_data = fill_in_missing_values_with_the_highest_frequency_value(data)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 通过属性的相关关系来填补缺失值

- 训练一个模型，根据非缺失值来预测缺失值，如果能学到潜在的属性相关关系将会非常有效
- 花费时间较长，且无法保证训练出的模型预测值一定准确，离群点不会消失

In [None]:
# Count the missing cells per column once, then split the columns into those
# with gaps (to be imputed) and those without (used as features).
na_counts = data.isna().sum()
miss_index = [column for column, missing in na_counts.items() if missing > 0]
comp_index = [column for column, missing in na_counts.items() if missing == 0]
new_data = fill_in_missing_values_through_attribute_correlation(
    data,
    miss_index=miss_index,
    complete_index=comp_index,
)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 通过数据对象之间的相似性来填补缺失值

- 相似性即距离的度量，使用k近邻策略修补缺失值
- 由于距离度量计算的原因，部分缺失值可能找不到与其相似的值来填补

In [None]:
new_data = fill_in_missing_values_through_similarity_between_data_objects(data, numerical_index=numerical_index, k=100)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

# [Movies Dataset from Pirated Sites](https://www.kaggle.com/datasets/arsalanrehman/movies-dataset-from-piracy-website)

- This dataset has been gathered from a pirated website that has a user base of around 2M visitors per month. This data contains more than 20,000 movies from all industries such as Hollywood, Bollywood, Anime, etc.
- Data Fields
    - id: movie's unique id
    - title: movie's name
    - storyline: a short description of the movie
    - views: no. of clicks per movie
    - downloads: no. of downloads per movie
    - IMDb-rating: rating
    - appropriate_for: R-rated, PG-13, etc
    - language: this can be multiple languages also
    - industry: Hollywood, Bollywood, etc.
    - posted_date: when the movie is posted on the platform
    - release_date: when the movie is released worldwide
    - run_time: movie length (converted to minutes in the loading cell below)
    - director: director's name
    - writer: list of all the writers

In [None]:
path = 'movies_dataset.csv'
data = pd.read_csv(path, sep=',')
# The "Unnamed: 0" column carries no meaning, so it is dropped outright.
data = data.drop(columns=["Unnamed: 0"])


def _strip_thousands(value):
    """Turn strings such as '1,234' into floats; pass non-strings through."""
    return float(value.replace(',', '')) if isinstance(value, str) else value


# Both counters are cleaned the same way.
for _count_col in ('downloads', 'views'):
    data[_count_col] = data[_count_col].apply(_strip_thousands)
# Date parsing is currently disabled:
# data['posted_date'] = pd.to_datetime(data['posted_date'])
# data['release_date'] = pd.to_datetime(data['release_date'])
def f(x):
    """Convert a run-time string such as '1h 30min' into minutes.

    Values that are not equal to themselves (NaN) are returned unchanged.
    """
    if x != x:
        # NaN is the only value that differs from itself: nothing to parse.
        return x
    pieces = x.replace(' ', '').replace('min', '').replace('m', '').split('h')
    weights = [60, 1]
    total = 0
    # Walk from the last piece backwards: the last one counts minutes
    # (weight 1), the one before it counts hours (weight 60).
    for offset, piece in enumerate(reversed(pieces)):
        if piece:
            total += weights[1 - offset] * int(piece)
    return total
# Replace every run_time entry with the result of f (defined earlier in this cell).
data['run_time'] = data['run_time'].apply(f)
# Print the frame's summary after the cleaning steps above.
data.info()

In [None]:
data.head()

## 数据摘要和可视化

In [None]:
# Nominal (categorical) attributes
nominal_index = [
    'appropriate_for',
    'director',
    'industry',
    'language',
    'posted_date',
    'release_date',
    'storyline',
    'title',
    'writer',
]
# Numerical attributes
numerical_index = [
    'IMDb-rating',
    'id',
    'downloads',
    'run_time',
    'views',
]

### 标称属性

In [None]:
nominal_summary(data, nominal_index=nominal_index)

### 数值属性

- 数据量虽然少，但是不少属性都存在大量离群点

In [None]:
numerical_summary(data, numerical_index=numerical_index)

In [None]:
# 数据可视化：使用直方图、盒图等检查数据分布及离群点
viz_pairs(data, labels=numerical_index, bins=20, new_df=None)

## 数据缺失的处理

- 实现与细节的讨论与上一数据集相似

### 将缺失部分剔除

In [None]:
new_data = remove_missing_parts(data)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 用最高频率值来填补缺失值

In [None]:
new_data = fill_in_missing_values_with_the_highest_frequency_value(data)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 通过属性的相关关系来填补缺失值

In [None]:
# Count the missing cells per column once, then split the columns into those
# with gaps (to be imputed) and those without (used as features).
na_counts = data.isna().sum()
miss_index = [column for column, missing in na_counts.items() if missing > 0]
comp_index = [column for column, missing in na_counts.items() if missing == 0]
new_data = fill_in_missing_values_through_attribute_correlation(
    data,
    miss_index=miss_index,
    complete_index=comp_index,
)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)

### 通过数据对象之间的相似性来填补缺失值

In [None]:
new_data = fill_in_missing_values_through_similarity_between_data_objects(data, numerical_index=numerical_index, k=10)
new_data.info()

In [None]:
nominal_summary(data, nominal_index, new_data)

In [None]:
numerical_summary(data, numerical_index=numerical_index, new_df=new_data)

In [None]:
viz_pairs(data, labels=numerical_index, bins=20, new_df=new_data)