In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler, PowerTransformer
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.mixture import BayesianGaussianMixture

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Load the competition data; the trailing expression displays the first rows.
df = pd.read_csv('../input/tabular-playground-series-jul-2022/data.csv')
df.head()

In [6]:
# Summarize the frame: dtype and non-null count of every column.
df.info()

# The data contains no null values.

In [7]:
# Drop the row identifier so it is not treated as a feature.
# errors='ignore' keeps this cell re-runnable: once 'id' is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')

In [8]:
# Annotated correlation heatmap over every remaining column of df.
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap='Blues', ax=ax)

In [9]:
# Summary statistics for the columns f_07 through f_13.
df[[f'f_{i:02d}' for i in range(7, 14)]].describe()

# The columns from f_07 to f_13 are categorical. 

In [10]:
# Split the features by dtype: float columns in f_f, integer columns in Cat_f.
f_f = df.select_dtypes(include=['float'])
Cat_f = df.select_dtypes(include=['int'])

In [11]:
# Annotated correlation heatmap restricted to the float-typed columns (f_f).
# Title typo fixed: 'Continues' -> 'Continuous'.
plt.figure(figsize = (24, 12))
sns.heatmap(f_f.corr(),annot=True,fmt=".2f", cmap='Blues').set_title('Continuous features',size=20)

**Categorical values**

In [12]:
# Annotated correlation heatmap restricted to the integer-typed columns (Cat_f).
fig, ax = plt.subplots(figsize=(24, 12))
sns.heatmap(Cat_f.corr(), annot=True, fmt=".2f", cmap='YlGnBu', ax=ax)
ax.set_title('Categorical features', size=20)

In [13]:
def my_boxplot(df):
    """Draw one box plot per column of ``df`` on a single 20x10 figure.

    Args:
        df: DataFrame whose columns are plotted side by side. It is not
            modified.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Melt straight from the input: rebuilding an identical DataFrame first
    # only copied the data. The long-form column names are passed explicitly
    # so they always match the x/y names given to sns.boxplot below.
    long_df = pd.melt(df, var_name="variable", value_name="value")
    plt.figure(figsize=(20, 10))
    sns.boxplot(x="variable", y="value", data=long_df).set_title('Boxplot of each feature', size=15)
    plt.show()
    
# Box plots of the unscaled features.
my_boxplot(df)

# Let's scale our features

In [14]:
# Fit a MaxAbsScaler on df and keep the scaled values as a DataFrame that
# reuses df's column names.
MaxAbs = MaxAbsScaler()
fff = pd.DataFrame(MaxAbs.fit_transform(df), columns=df.columns)
fff.head()

In [15]:
# Box plots of the features after MaxAbs scaling.
my_boxplot(fff)

# Let's make our data more Gaussian-like.

In [16]:
# Apply a PowerTransformer (default settings) to the scaled features and wrap
# the result back into a DataFrame with df's column names.
# NOTE: fff is both the input and the output here, so re-running this cell
# transforms the already-transformed data again.
Power_transformer = PowerTransformer()
fff = pd.DataFrame(Power_transformer.fit_transform(fff), columns=df.columns)
my_boxplot(fff)

# Outliers could reduce the accuracy of a model, so let's clip values that lie more than ~3 standard deviations from the mean.

In [17]:
def number_of_outliers(data):
    """Clip every column of ``data`` to ``mean ± 3 * std`` and return the result.

    For each column the mean, the standard deviation and the resulting bounds
    are printed, then values above ``mean + 3 * std`` are replaced by that
    upper bound and values below ``mean - 3 * std`` by that lower bound.

    Args:
        data: DataFrame of numeric columns. It is not modified.

    Returns:
        A new DataFrame with the same index and columns as ``data`` whose
        values are limited to the per-column bounds.
    """
    clipped = {}
    for name in data.columns.to_list():
        print(f'Текущий стоблец --{name}')
        mean = data[name].mean()
        std = data[name].std()
        print(f'Среднее текущего стоблца -- {mean}, std текущего столбца {std}')
        supremum = mean + 3 * std
        infimum = mean - 3 * std
        print(f'Супремум текущего стоблца -- {supremum}, инфимум текущего столбца {infimum}')
        # The previous element-wise loop only rebound its loop variable, so no
        # value was ever replaced. Series.clip returns a new, actually clipped
        # Series and leaves the input untouched.
        clipped[name] = data[name].clip(lower=infimum, upper=supremum)
    # index= keeps the original row labels even when there are no columns.
    new_df = pd.DataFrame(data=clipped, index=data.index, columns=data.columns)
    return new_df
    

In [18]:
# Limit every feature to the interval [-3.0, 3.0].
# One vectorized DataFrame.clip call replaces two Python-level apply passes
# per column; values inside the interval (and NaN) are left unchanged.
fff = fff.clip(lower=-3.0, upper=3.0)

In [19]:
# Summary statistics after clipping; min and max should now lie within [-3, 3].
fff.describe()

In [20]:
# Box plots of the features after clipping.
my_boxplot(fff)

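# We will choose the number of clusters from the inertia curve: inertia always decreases as clusters are added, so we look for the "elbow" where the decrease levels off rather than for a minimum.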

In [21]:
def cluster(n_clusters, random_state=42):
    """Fit K-Means with ``n_clusters`` clusters on the global ``fff`` frame.

    Args:
        n_clusters: Number of clusters to fit.
        random_state: Seed passed to ``KMeans`` so that repeated runs give the
            same centroids and inertia. Defaults to 42.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    # Without a seed the centroid initialization is random, so the inertia
    # curve would change from one run of the notebook to the next.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(fff)
    return kmeans

# Inertia (KMeans.inertia_) of each fitted model, in order of increasing k.
Inertia = []

# Fit one K-Means model for every cluster count from 1 to 14.
for k in range(1, 15):
    kmeans = cluster(k)
    Inertia.append(kmeans.inertia_)

In [22]:
# Inertia against the number of clusters. The x-axis is derived from the
# number of recorded inertias (cluster counts start at 1 in the loop that
# fills Inertia), so it stays in sync if the evaluated range changes.
px.line(x=list(range(1, len(Inertia) + 1)), y=Inertia,
        labels={'y': 'Inertia', 'x': 'Number of clusters'})

In [23]:
# Fit a 7-component, full-covariance Bayesian Gaussian mixture on the
# preprocessed features and use the predicted component of each row as its
# cluster label.
mixture = BayesianGaussianMixture(
    n_components=7,
    covariance_type='full',
    max_iter=1000,
    random_state=42,
)
preds = mixture.fit_predict(fff)

In [24]:
# Build the submission: take the sample file, overwrite its 'Predicted'
# column with the cluster labels, and write it to the working directory.
sub = pd.read_csv(
    '../input/tabular-playground-series-jul-2022/sample_submission.csv'
).assign(Predicted=preds)
sub.to_csv('submission.csv', index=False)