## Data structure

In [None]:
import pandas as pd

# Load the raw dataset; the path is relative to the working directory.
df = pd.read_csv("data/rawdata.csv")

# DataFrame.info() writes its report directly to stdout and returns None, so
# interpolating it into an f-string prints the report first and then a
# misleading "Dataframe values types: None" line. Print the label, then call
# info() on its own.
print("Dataframe values types:")
df.info()
print(f"Description of dataframe: {df.describe()}")
print(f"Number of unique values: {df.nunique()}")
print(f"Number of duplicated rows: {df.duplicated().sum()}")

## Checking missing values

In [None]:
# Per-column count of null entries, columns with the most nulls listed first.
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
)

## Checking if the dataset is balanced

In [None]:
# normalize=True makes value_counts return fractions instead of raw counts,
# so the y-axis is labelled explicitly to avoid it being read as a count.
# The trailing ';' keeps the Axes repr out of the cell output.
df['Churn'].value_counts(normalize=True).plot(
    kind='bar',
    title="Class balance of 'Churn'",
    xlabel="Churn",
    ylabel="Share of rows",
);

## Feature distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram per column of df; each figure is labelled and shown before
# the next one is drawn.
for feature in df.columns:
    # histplot returns the Axes it drew on, so title and labels are set there.
    ax = sns.histplot(data=df, x=feature)
    ax.set(
        title=f"Distribution of feature: {feature}",
        xlabel=feature,
        ylabel="frequency",
    )
    plt.show()

## Dependence of numerical features on churn

In [None]:
# Box plot of each listed column grouped by Churn, one figure per column.
for numeric_feature in ('MonthlyCharges', 'tenure'):
    sns.boxplot(x='Churn', y=numeric_feature, data=df)
    plt.show()

## Categorical features vs Churn

In [None]:
# normalize='index' scales every InternetService row to sum to 1, so each
# stacked bar shows the Churn split within that category. The y-axis is
# labelled so the bars are not read as raw counts; the trailing ';' keeps
# the Axes repr out of the cell output.
pd.crosstab(df['InternetService'], df['Churn'], normalize='index').plot(
    kind='bar',
    stacked=True,
    title="Churn share by InternetService",
    ylabel="Share of rows",
);

## Correlation heatmap

In [None]:
# Pairwise correlations between the numeric-dtype columns, drawn as an
# annotated heatmap.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')