# Classification

## Import libraries

In [1]:
#!pip install matplotlib seaborn scikit-learn

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline 

import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
from getpass import getpass

from clickhouse_driver import Client


# Credentials are read from the environment so they are never stored in the
# saved notebook; when a variable is not set the user is prompted instead.
user_name = os.environ.get('CLICKHOUSE_USER') or input('ClickHouse user: ')
pwd = os.environ.get('CLICKHOUSE_PASSWORD') or getpass('ClickHouse password: ')

# Open a connection to the ClickHouse server.
client = Client(host='clickhouse.lab.karpov.courses', port=9000,
                user=user_name, password=pwd, database='hardda')

# Smoke-test the connection by fetching a single row.
result = client.execute("SELECT * FROM hardda.user_dm_events LIMIT 1")

# Show the fetched row; slicing keeps the loop safe if nothing was returned.
for row in result[0:1]:
    print(row)

(datetime.date(2022, 2, 1), datetime.date(2022, 1, 31), 'android', 'f7411212fd0e2523e126cbfdd3f226c211212', '4beb10e1-aeeb-4c52-acd2-ce1ddbc1fc24b10e1', 22, 11, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0)


## Tasks

### Task 1.

 **Write an SQL query to load the data.**

In [4]:
# Loads live adverts joined (LEFT JOIN on passport_id) with the passport-level
# `user_type_cars_name`. Filters visible in the query: execution dates from
# 2022-10-01 through 2022-11-01, the 'Красноярск' region, rows whose `year`
# is not '0', and adverts created before 2022-10-02.
# NOTE(review): `la.*` makes the result column order depend on the table
# schema, while the DataFrame built from this result labels columns by
# position — confirm the two stay in sync.
query = '''
SELECT 
  la.*,
  up.user_type_cars_name
FROM 
  live_adverts la
LEFT JOIN 
  user_passports up ON la.passport_id = up.passport_id
WHERE 
  execution_date BETWEEN '2022-10-01' AND '2022-11-01'
    AND region = 'Красноярск'
      AND year <> '0'
        AND created_at < '2022-10-02'
'''

In [5]:
result = client.execute(query)

In [6]:
len(result)

87286

In [7]:
# Column labels are assigned by position, in the order the query returns them.
advert_columns = [
    'execution_date', 'advert_id', 'created_at', 'price', 'region',
    'user_id', 'platform', 'auto_brand', 'auto_model', 'passport_id',
    'year', 'user_type', 'user_type_cars_name',
]
df = pd.DataFrame(result, columns=advert_columns)

df.head()

Unnamed: 0,execution_date,advert_id,created_at,price,region,user_id,platform,auto_brand,auto_model,passport_id,year,user_type,user_type_cars_name
0,2022-10-27,139107001,2014-04-18 06:30:07,1440000,Красноярск,123550057,desktop,Nissan,Patrol,123570989,2005,0,cars_simple
1,2022-10-27,140854575,2014-07-28 05:03:11,0,Красноярск,123927406,desktop,Unknown,Unknown model,123948401,2022,7,cars_dealer
2,2022-10-27,142255830,2014-10-21 17:43:15,12000000,Красноярск,123510982,unknown,Unknown,Unknown model,123531907,2012,0,cars_simple
3,2022-10-27,142758756,2014-11-19 07:55:28,10544000,Красноярск,124005693,desktop,Unknown,Unknown model,124026697,2022,7,cars_dealer
4,2022-10-27,143662578,2015-01-09 16:47:38,12000000,Красноярск,123510982,unknown,Unknown,Unknown model,123531907,2014,0,cars_simple


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87286 entries, 0 to 87285
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   execution_date       87286 non-null  object        
 1   advert_id            87286 non-null  int64         
 2   created_at           87286 non-null  datetime64[ns]
 3   price                87286 non-null  int64         
 4   region               87286 non-null  object        
 5   user_id              87286 non-null  int64         
 6   platform             87286 non-null  object        
 7   auto_brand           87286 non-null  object        
 8   auto_model           87286 non-null  object        
 9   passport_id          87286 non-null  int64         
 10  year                 87286 non-null  object        
 11  user_type            87286 non-null  int64         
 12  user_type_cars_name  87286 non-null  object        
dtypes: datetime64[ns](1), int64(5),

In [9]:
# `execution_date` holds plain dates, so a day-level format describes it fully.
df['execution_date'] = pd.to_datetime(df['execution_date'], format='%Y-%m-%d')
# `created_at` carries a time of day (e.g. 2014-04-18 06:30:07), which the
# '%Y-%m-%d' format cannot describe. Passing that format would fail on string
# input and is silently ignored for an already-datetime column, so let pandas
# infer the layout instead.
df['created_at'] = pd.to_datetime(df['created_at'])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87286 entries, 0 to 87285
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   execution_date       87286 non-null  datetime64[ns]
 1   advert_id            87286 non-null  int64         
 2   created_at           87286 non-null  datetime64[ns]
 3   price                87286 non-null  int64         
 4   region               87286 non-null  object        
 5   user_id              87286 non-null  int64         
 6   platform             87286 non-null  object        
 7   auto_brand           87286 non-null  object        
 8   auto_model           87286 non-null  object        
 9   passport_id          87286 non-null  int64         
 10  year                 87286 non-null  object        
 11  user_type            87286 non-null  int64         
 12  user_type_cars_name  87286 non-null  object        
dtypes: datetime64[ns](2), int64(5),

In [11]:
df = df.sort_values(['execution_date', 'user_id', 'advert_id']).reset_index(drop=True)

In [12]:
df.head()

Unnamed: 0,execution_date,advert_id,created_at,price,region,user_id,platform,auto_brand,auto_model,passport_id,year,user_type,user_type_cars_name
0,2022-10-01,265755191,2022-09-16 15:14:35,9560000,Красноярск,123457700,ios,Kia,Carnival,123464228,2022,0,cars_simple
1,2022-10-01,265927691,2022-09-20 12:20:26,1900000,Красноярск,123460471,android,Unknown,Unknown model,123467852,2003,0,cars_simple
2,2022-10-01,226358865,2020-02-22 10:30:11,2000000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2007,0,cars_simple
3,2022-10-01,231423394,2020-07-29 09:25:16,2630000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2007,0,cars_simple
4,2022-10-01,242062459,2021-04-14 07:17:07,1530000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2002,0,cars_simple


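Create churn flags per `passport_id`: `1` — churned (first seen on 2022-10-01 and last seen more than 14 days before 2022-11-01), `0` — retained (seen on both 2022-10-01 and 2022-11-01), `2` — everything else (these passports are excluded from the dataset below).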

In [13]:
# Boundaries of the observation window and the inactivity gap that counts as churn.
PERIOD_START = pd.Timestamp(2022, 10, 1)
PERIOD_END = pd.Timestamp(2022, 11, 1)
CHURN_GAP = pd.Timedelta(days=14)

ids_lst = df['passport_id'].unique().tolist()

# First and last day each passport had a live advert, computed in one pass.
# sort=False keeps passports in order of first appearance, like the per-id loop did.
activity = (
    df.groupby('passport_id', sort=False)['execution_date']
      .agg(first_seen='min', last_seen='max')
)

present_at_start = activity['first_seen'] == PERIOD_START
# 1: present on the first day and silent for more than CHURN_GAP before the end.
churned = present_at_start & ((PERIOD_END - activity['last_seen']) > CHURN_GAP)
# 0: present on both the first and the last day of the window.
retained = present_at_start & (activity['last_seen'] == PERIOD_END)

# 2: everything else (joined late, or left within the last CHURN_GAP days).
churn_flags = np.select([churned, retained], [1, 0], default=2)
churn_dct = dict(zip(activity.index.tolist(), churn_flags.tolist()))

In [14]:
df['churn'] = df['passport_id'].map(churn_dct)

In [15]:
main_df = pd.DataFrame(list(churn_dct.items()), columns=['passport_id', 'churn'])

In [16]:
# Keep only passports labelled 0 or 1; label 2 marks the ones to be excluded.
df = df.loc[df['churn'].ne(2)]
main_df = main_df.loc[main_df['churn'].ne(2)]

In [17]:
# Snapshot of the adverts that were live on 2022-10-01, re-indexed from zero.
snapshot_mask = df['execution_date'].eq(datetime(2022, 10, 1))
data = df.loc[snapshot_mask].reset_index(drop=True)

In [18]:
data.head()

Unnamed: 0,execution_date,advert_id,created_at,price,region,user_id,platform,auto_brand,auto_model,passport_id,year,user_type,user_type_cars_name,churn
0,2022-10-01,265927691,2022-09-20 12:20:26,1900000,Красноярск,123460471,android,Unknown,Unknown model,123467852,2003,0,cars_simple,0
1,2022-10-01,226358865,2020-02-22 10:30:11,2000000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2007,0,cars_simple,0
2,2022-10-01,231423394,2020-07-29 09:25:16,2630000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2007,0,cars_simple,0
3,2022-10-01,242062459,2021-04-14 07:17:07,1530000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2002,0,cars_simple,0
4,2022-10-01,249980206,2021-10-06 06:19:18,720000,Красноярск,123462064,desktop,Unknown,Unknown model,123469843,2008,0,cars_simple,0


### Task 2. 

**Count the number of live ads for each user on the date execution_date=='2022-10-01'. How many live ads were there in total?**

In [19]:
# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# One grouped count instead of re-filtering `data` for every passport.
# Passports with no adverts in the snapshot get 0, as a per-id count would.
adverts_per_passport = (
    data.groupby('passport_id')['advert_id']
        .count()
        .reindex(ids_lst, fill_value=0)
)
opening_adverts_amount_dct = adverts_per_passport.to_dict()

main_df['opening_adverts_amount'] = main_df['passport_id'].map(opening_adverts_amount_dct)

In [20]:
main_df

Unnamed: 0,passport_id,churn,opening_adverts_amount
1,123467852,0,1
2,123469843,0,4
3,123475067,0,2
4,123476026,0,1
5,123478402,1,2
...,...,...,...
4463,144333254,1,1
4464,144333596,0,1
4466,144333976,1,1
4467,144334504,1,1


In [21]:
main_df['opening_adverts_amount'].sum()

4503

### Task 3.  

**Calculate their average check (average ad price per user) with prices on execution_date=='2022-10-01'. What is the largest average check found in the resulting dataset?** 

In [22]:
# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# Average advert price per passport in a single grouped pass; passports with
# no adverts in the snapshot end up as NaN, as the mean of an empty selection would.
avg_price_per_passport = (
    data.groupby('passport_id')['price']
        .mean()
        .reindex(ids_lst)
)
price_dct = avg_price_per_passport.to_dict()

main_df['price'] = main_df['passport_id'].map(price_dct)

In [23]:
main_df.price.max()

24000000.0

### Task 4. 

**Calculate the average age of cars in years. What is the median of the average car age by passport_id?** 

In [24]:
# Year against which car age is measured (the year of the snapshot date).
REFERENCE_YEAR = 2022

# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# `year` arrives as text, so cast it before doing arithmetic.
data['year'] = data['year'].astype(int)
data['auto_age'] = REFERENCE_YEAR - data['year']

# Average car age per passport in a single grouped pass; passports with no
# adverts in the snapshot end up as NaN.
avg_age_per_passport = (
    data.groupby('passport_id')['auto_age']
        .mean()
        .reindex(ids_lst)
)
age_dct = avg_age_per_passport.to_dict()

main_df['auto_age'] = main_df['passport_id'].map(age_dct)

In [25]:
main_df.auto_age.median()

14.0

### Task 5. 

**Calculate the average age of ads for a user on the date execution_date=='2022-10-01' in days. What is the largest average age of an ad found in the resulting dataset?**

In [26]:
# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# Advert age in calendar days. `created_at` is truncated to midnight before
# subtracting, so an advert created on the snapshot day is 0 days old and one
# created the day before is 1 day old. The earlier `.dt.days + 1` form gave
# the same numbers except for adverts created exactly at 00:00:00, which it
# overcounted by one day.
created_day = data['created_at'].dt.normalize()
data['days_dif'] = (data['execution_date'] - created_day).dt.days

# Average advert age per passport in a single grouped pass; passports with no
# adverts in the snapshot end up as NaN.
avg_days_per_passport = (
    data.groupby('passport_id')['days_dif']
        .mean()
        .reindex(ids_lst)
)
days_dict = avg_days_per_passport.to_dict()

main_df['advert_age'] = main_df['passport_id'].map(days_dict)

In [27]:
main_df['advert_age'].max()

2881.0

### Task 6. 

**Create a feature with the "most popular platform" for a user.**

In [28]:
# Order in which tied platforms win, and the value used when none of them is tied.
PLATFORM_TIE_PRIORITY = ('android', 'ios', 'desktop', 'unknown')
PLATFORM_TIE_FALLBACK = 'mobile'


def most_popular_platform(platforms):
    """Return the platform that occurs most often in ``platforms``.

    Parameters
    ----------
    platforms : pandas.Series
        Platform names of one user's adverts.

    Returns
    -------
    str or float
        The single most frequent platform. When several platforms share the
        top count, the first of ``PLATFORM_TIE_PRIORITY`` found among them is
        returned, or ``PLATFORM_TIE_FALLBACK`` if none of them is tied.
        ``NaN`` when there are no non-null values.
    """
    counts = platforms.value_counts()
    if counts.empty:
        return np.nan

    # `.iloc` is explicit positional access; a plain `[0]` on this
    # label-indexed Series relies on a deprecated positional fallback.
    top_count = counts.iloc[0]
    leaders = counts[counts == top_count].index
    if len(leaders) == 1:
        return leaders[0]

    for candidate in PLATFORM_TIE_PRIORITY:
        if candidate in leaders:
            return candidate
    return PLATFORM_TIE_FALLBACK


# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# Count platforms once per passport instead of re-filtering `data` repeatedly.
platform_per_passport = (
    data.groupby('passport_id')['platform']
        .agg(most_popular_platform)
        .reindex(ids_lst)
)
platform_dct = platform_per_passport.to_dict()

main_df['platform'] = main_df['passport_id'].map(platform_dct)

In [29]:
main_df['platform'].value_counts()

android    2470
ios        1144
desktop     126
unknown       1
Name: platform, dtype: int64

### Task 7. 

**Create a feature that indicates whether a user has any ads for the car models that are most popular on the platform on execution_date=='2022-10-01'. A model is considered popular if it is among the top 10 models by number of ads placed on that date. How many passport_id values have popular ads on the specified date?**

In [30]:
top_10 = data[data['auto_model'] != 'Unknown model']['auto_model'].value_counts().head(10).index

In [31]:
top_10

Index(['Camry', '2110 (седан)', '2114 (хэтчбек)', 'Passat', 'ГАЗель',
       'Granta 2190 (седан)', 'Priora 2170 (седан)', '2112 (хэтчбек)',
       '2121 Нива', 'Priora 2172 (хэтчбек)'],
      dtype='object')

In [32]:
# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# Flag every advert whose model is in the top 10, then reduce per passport:
# 1 if at least one advert is flagged, 0 otherwise. Passports with no adverts
# in the snapshot get 0.
is_top_advert = data['auto_model'].isin(top_10)
has_top_model = (
    is_top_advert.groupby(data['passport_id'])
                 .any()
                 .astype(int)
                 .reindex(ids_lst, fill_value=0)
)
top_10_dct = has_top_model.to_dict()

main_df['is_top_model'] = main_df['passport_id'].map(top_10_dct)

In [33]:
main_df.is_top_model.sum()

1109

### Task 8. 

**What is the least popular user type? How many users of that type are there?**

In [34]:
data.user_type.value_counts()

0    3992
6     286
7     117
4     108
Name: user_type, dtype: int64

In [35]:
data.user_type_cars_name.value_counts()

cars_simple    3627
cars_seller     518
                244
cars_dealer     114
Name: user_type_cars_name, dtype: int64

In [36]:
def most_frequent_value(values):
    """Return the most frequent value of ``values``.

    ``Series.mode`` returns tied values in sorted order, so the first one is
    taken. ``NaN`` is returned when there are no non-null values.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Passports that remain after the churn filtering; each one gets an entry.
ids_lst = df['passport_id'].unique().tolist()

# Most frequent user type name per passport in a single grouped pass.
user_type_per_passport = (
    data.groupby('passport_id')['user_type_cars_name']
        .agg(most_frequent_value)
        .reindex(ids_lst)
)
user_type_name_dct = user_type_per_passport.to_dict()

main_df['user_type_cars_name'] = main_df['passport_id'].map(user_type_name_dct)

In [37]:
main_df.user_type_cars_name.value_counts()

cars_simple    3260
cars_seller     334
                109
cars_dealer      38
Name: user_type_cars_name, dtype: int64

### Task 9. 

**What column has missing values?**

In [38]:
main_df['user_type_cars_name'] = main_df['user_type_cars_name'].replace('', np.nan)

In [39]:
main_df['user_type_cars_name'].isna().sum()

109

### Task 10. 

**How balanced are the classes to be predicted? What minimum accuracy would we like to expect from the final model (round to hundredths)?**

In [40]:
main_df = main_df.sort_values('passport_id').reset_index(drop=True)

In [41]:
main_df

Unnamed: 0,passport_id,churn,opening_adverts_amount,price,auto_age,advert_age,platform,is_top_model,user_type_cars_name
0,123467852,0,1,1900000.0,19.0,11.00,android,0,cars_simple
1,123469843,0,4,1720000.0,16.0,660.25,desktop,0,cars_simple
2,123475067,0,2,10250000.0,7.5,632.00,android,0,cars_simple
3,123476026,0,1,2400000.0,14.0,2772.00,desktop,0,cars_simple
4,123478402,1,2,2600000.0,8.0,4.00,ios,1,cars_simple
...,...,...,...,...,...,...,...,...,...
3736,144333596,0,1,400000.0,7.0,0.00,android,0,cars_simple
3737,144333976,1,1,40000.0,19.0,0.00,ios,1,cars_simple
3738,144334504,1,1,84000.0,16.0,0.00,ios,1,cars_simple
3739,144334622,1,1,500000.0,23.0,0.00,android,0,cars_simple


In [42]:
# Target is the churn flag; features are everything except it and the id.
y_ = main_df['churn']
X_ = main_df.drop(columns=['churn', 'passport_id'])

In [43]:
# The accuracy to beat is that of always predicting the most frequent class,
# i.e. the largest class share. The share of class 1 alone is only the right
# number when class 1 happens to be the majority.
class_shares = y_.value_counts(normalize=True)
min_accuracy = class_shares.max()
print(f'Minimal accuracy to expext: {round(min_accuracy, 2)}')

Minimal accuracy to expext: 0.78


### Task 11. 

**Prepare a validation strategy (an ordinary KFold with shuffle=True, random_state=42 and 3 folds). Then implement a basic transformer pipeline that:**

- Fills in the missing value with the most popular value.
- Applies OHE-encoding to categorical features.
- Does MinMax scaling.

In [44]:
# 3-fold cross-validation with shuffling; the fixed seed keeps folds repeatable.
kfold_params = {'n_splits': 3, 'shuffle': True, 'random_state': 42}
splitter = KFold(**kfold_params)

In [45]:
# Non-numeric columns get missing values replaced by their most frequent
# value; all remaining columns pass through unchanged. Output stays a DataFrame.
mode_imputer_step = (
    'SimpleImputer',
    SimpleImputer(strategy='most_frequent'),
    selector(dtype_exclude='number'),
)
imputer_most_popular = ColumnTransformer(
    transformers=[mode_imputer_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
imputer_most_popular = imputer_most_popular.set_output(transform='pandas')

In [46]:
# One-hot encode the non-numeric columns (first level dropped, unseen levels
# ignored); all remaining columns pass through unchanged. Output stays a DataFrame.
one_hot_step = (
    'OneHotEncoder',
    OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
    selector(dtype_exclude='number'),
)
ohe = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
ohe = ohe.set_output(transform='pandas')

In [47]:
# Min-max scale the numeric columns; any other column passes through
# unchanged. Output stays a DataFrame.
scaling_step = (
    'MinMaxScaler',
    MinMaxScaler(),
    selector(dtype_include='number'),
)
minmax_scaler = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
minmax_scaler = minmax_scaler.set_output(transform='pandas')

In [48]:
# Steps run in this order: imputation, then one-hot encoding, then scaling.
preprocessing_steps = [
    ('imputer_most_popular', imputer_most_popular),
    ('ohe', ohe),
    ('minmax_scaler', minmax_scaler),
]
final_process_pipeline = Pipeline(steps=preprocessing_steps)

In [49]:
X_transformed = final_process_pipeline.fit_transform(X_)

In [50]:
X_transformed

Unnamed: 0,platform_desktop,platform_ios,platform_unknown,user_type_cars_name_cars_seller,user_type_cars_name_cars_simple,opening_adverts_amount,price,auto_age,advert_age,is_top_model
0,0.0,0.0,0.0,0.0,1.0,0.000000,0.079167,0.316667,0.003818,0.0
1,1.0,0.0,0.0,0.0,1.0,0.125000,0.071667,0.266667,0.229174,0.0
2,0.0,0.0,0.0,0.0,1.0,0.041667,0.427083,0.125000,0.219368,0.0
3,1.0,0.0,0.0,0.0,1.0,0.000000,0.100000,0.233333,0.962166,0.0
4,0.0,1.0,0.0,0.0,1.0,0.041667,0.108333,0.133333,0.001388,1.0
...,...,...,...,...,...,...,...,...,...,...
3736,0.0,0.0,0.0,0.0,1.0,0.000000,0.016667,0.116667,0.000000,0.0
3737,0.0,1.0,0.0,0.0,1.0,0.000000,0.001667,0.316667,0.000000,1.0
3738,0.0,1.0,0.0,0.0,1.0,0.000000,0.003500,0.266667,0.000000,1.0
3739,0.0,0.0,0.0,0.0,1.0,0.000000,0.020833,0.383333,0.000000,0.0


In [51]:
X_transformed.to_csv('X_transformed.csv', index=False)

### Task 12. 

To be continued.