# Task 1

In [6]:
# EDA.ipynb

import sys
sys.path.append('../src')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from data_loader import DataLoader
from data_quality_check import DataQualityCheck
from data_clean_processing import DataCleanProcessing

# Read the review spreadsheet through the project's loader.
# NOTE(review): the output recorded for this cell is a UnicodeDecodeError
# ('utf-8' codec). The input is an .xlsx workbook, so presumably something is
# decoding it as UTF-8 text - confirm how DataLoader opens this file.
data_loader = DataLoader("../data/Apollo android review data.xlsx")
data = data_loader.load_data()

if data is None:
    # The loader handed back nothing, so there is nothing to analyse.
    print("Failed to load data.")
else:
    # Data quality check: print whatever summary DataQualityCheck produces.
    quality_check = DataQualityCheck(data)
    summary = quality_check.summary()
    print(summary)

    # Data cleaning and processing: the processed result feeds the plot below.
    clean_process = DataCleanProcessing(data)
    cleaned_data = clean_process.process_data()

    # EDA: histogram of the 'rating' column with 20 bins on a 10x6 figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(cleaned_data['rating'], bins=20, ax=ax)
    ax.set_title('Distribution of Ratings')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Frequency')
    plt.show()

    # More EDA code and visualizations

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9a in position 10: invalid start byte

## Load the Data

In [None]:
import sys
sys.path.append('../src')  # Add the src directory to the system path

from data_loader import DataLoader
from data_quality_check import DataQualityCheck
from data_clean_processing import DataCleanProcessing

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the data
# The path is relative, so it resolves against the notebook's working
# directory.
file_path = '../data/MachineLearningRating_v3.txt'
data_loader = DataLoader(file_path)
# NOTE(review): another cell in this notebook compares the result of
# load_data() with None before using it; no such check is made here -
# confirm whether the loader can return None for this file.
data = data_loader.load_data()

In [None]:
# Display basic info and head of the data
# Both helpers are DataLoader methods defined in ../src and are not visible
# here; presumably they print an overview and the first rows of the loaded
# data - confirm against data_loader.py.
data_loader.basic_info()
data_loader.display_head()

## Checking Missing Values

In [None]:
# Initialize and load data
# NOTE(review): DataQualityCheck is constructed from a file path here, while
# the first cell of this notebook constructs it from an already loaded
# dataset - confirm which argument the class actually expects.
data_quality = DataQualityCheck(file_path)
# Rebinds `data`, replacing the object returned earlier by
# data_loader.load_data().
data = data_quality.load_data()
# NOTE(review): nothing in this cell inspects missing values explicitly;
# presumably basic_info() reports them - confirm, given the section heading.
data_quality.basic_info()

## Data Cleaning and Processing

In [None]:
# Wrap the loaded data in the project's cleaning helper.
data_cleaner = DataCleanProcessing(data)

# Run the missing-value cleaning step and keep its result for later cells.
cleaned_data = data_cleaner.clean_missing_values()

# Ask the cleaner whether any missing values are left and report the outcome.
print(
    "Data cleaned successfully with no missing values remaining."
    if data_cleaner.verify_no_missing_values()
    else "There are still missing values in the data."
)

In [None]:
# Display the shape of the dataframe after cleaning, to check whether the
# cleaning step dropped any rows or columns.
# This is the last expression in the cell, so the notebook shows its value.
cleaned_data.shape

In [None]:
# Display information about the dataframe
# Assumes cleaned_data is a pandas DataFrame, in which case info() prints
# the column names, non-null counts and dtypes - TODO confirm the type
# returned by clean_missing_values().
cleaned_data.info()

In [None]:
# Replace the original data with the cleaned data
# This only rebinds the name: `data` and `cleaned_data` now refer to the same
# object and no copy is made. Every later cell that reads `data` therefore
# works on the cleaned version.
data = cleaned_data

## EDA

### Univariate Analysis

#### Histograms for numerical columns:

In [None]:
import matplotlib.pyplot as plt

# Plot histograms for numerical columns
# Assumes `data` is a pandas DataFrame, whose hist() draws one subplot per
# numeric column. Each histogram uses 50 bins and the whole grid shares a
# single 20x15 figure.
data.hist(bins=50, figsize=(20, 15))
plt.show()

#### Bar charts for categorical columns:

In [None]:
# One bar chart per object-dtype (categorical) column: count how often each
# distinct value occurs and draw the counts, titled with the column name.
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    value_frequencies = data[column].value_counts()
    value_frequencies.plot.bar(figsize=(10, 5))
    plt.title(column)
    # Show inside the loop so every column gets its own rendered chart.
    plt.show()

### Bivariate Analysis

#### Scatter plots for relationships between numerical columns:

In [None]:
# Example bivariate view: each point pairs a row's TotalPremium (x axis)
# with its TotalClaims (y axis).
fig, ax = plt.subplots()
ax.scatter(data['TotalPremium'], data['TotalClaims'])
ax.set_xlabel('TotalPremium')
ax.set_ylabel('TotalClaims')
ax.set_title('TotalPremium vs TotalClaims')
plt.show()

#### Correlation matrix:

In [None]:
# Restrict the frame to its float and int columns. The result is kept in
# `numerical_data` because the box-plot cell further down reads it.
numerical_data = data.select_dtypes(include=[float, int])

# Pairwise correlation between those columns.
corr_matrix = numerical_data.corr()

# Annotated heatmap of the full matrix on a 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

## Outlier Detection

#### Box plots for outlier detection:

In [None]:
# Plot box plots for numerical columns to detect outliers
# `numerical_data` is created in the correlation-matrix cell, so that cell
# must have been run first. Only the two monetary columns are drawn, side by
# side on the same axes.
numerical_data.boxplot(column=['TotalPremium', 'TotalClaims'])
plt.show()

## Creating insightful and aesthetically pleasing visualizations

### Distribution of Total Premiums

In [None]:
# Histogram of TotalPremium (50 bins) with a KDE curve overlaid, drawn in
# sky blue on a 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='TotalPremium', bins=50, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Total Premiums')
ax.set_xlabel('Total Premium')
ax.set_ylabel('Frequency')
plt.show()

### Relationship Between Total Claims and Total Premiums

In [None]:
# TotalClaims against TotalPremium with a fitted regression line.
# The points are half-transparent so dense regions stay readable, and the
# fitted line is drawn in red to stand out from them.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=data,
    x='TotalPremium',
    y='TotalClaims',
    scatter_kws=dict(alpha=0.5),
    line_kws=dict(color='red'),
    ax=ax,
)
ax.set_title('Total Claims vs Total Premiums')
ax.set_xlabel('Total Premium')
ax.set_ylabel('Total Claims')
plt.show()

### Correlation Heatmap with Highlighted Key Variables

In [None]:
# Correlation between the float and int columns of the data.
corr_matrix = data.select_dtypes(include=[float, int]).corr()

# Boolean mask that is True on the diagonal and above it. Seaborn hides the
# masked cells, so only the part of the matrix below the diagonal is drawn
# and each pair of columns appears once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Annotated heatmap (two decimals per cell, thin lines between cells) on a
# 14x10 figure.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()