### **2nd place solution for Tuwaiq : Unlocking Potential for Elite Training Programs**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
import optuna

Columns

    Student ID: String - A unique identifier for each student.
    Age: Int - The age of the student.
    Gender: String - Gender of the student.
    Home Region: String - Student's home region.
    Home City: String - Student's home city.
    Program ID: String - Unique identifier for each program.
    Program Main Category Code: String - Main category of the program (Encoded).
    Program Sub Category Code: String - Sub-category of the program (Encoded).
    Technology Type: String - Type of technology used in the program.
    Program Skill Level: String - Skill level of the program.
    Program Presentation Method: String - Presentation method of the program (in-person or online).
    Program Start Date: Date - The date the program started.
    Program End Date: Date - The date the program ended.
    Program Days: Int - Count of days in the program.
    Completed Degree: String - Indicates if the student completed a university/college degree.
    Level of Education: String - Highest university/college degree received by the student.
    Education Specialty: String - University/college degree specialty.
    College: String - The student's university/college.
    University Degree Score: String - The score of the student in university/college.
    University Degree Score System: String - The scoring system used for the student's university/college score.
    Employment Status: String - Current employment status of the student.
    Job Type: String - Type of employment for the student.
    Still Working: String - Indicates if the student is currently working.
    Y (Target): Bool - Indicates if the student completed the program by achieving the minimum attendance percentage.(1 = Did not complete the program, 0 = Successfully completed the program)


In [2]:
pd.set_option('display.max_columns',  50)

In [3]:
reg_df = pd.read_csv('registration.csv')
sub_df = pd.read_csv('sample_submission.csv')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
reg_df.head()

Unnamed: 0,Student ID,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration
0,0005f921-db87-47a3-af19-000332af236b,6,0,1,3,0,0,0,3,0,0,6,19
1,000f66b3-6ad7-4a6c-9f1f-0d34b005c5e6,0,0,7,0,0,0,0,1,0,0,0,8
2,001a2d9a-54ee-4ae9-91b4-d6fe03c98ce0,0,0,2,0,0,0,0,1,0,0,0,3
3,00235747-dd1a-4f59-be5b-4aeb66e037a1,0,0,1,0,0,0,0,0,0,0,0,1
4,00252da1-947b-4116-a096-852830ac0f89,0,0,1,1,0,0,0,1,0,0,7,10


In [5]:
common_ids = set(test_df['Student ID'].unique()).intersection(set(train_df['Student ID'].unique()))

In [6]:
if common_ids:
    print("There are common Student IDs between test_df and train_df.")
else:
    print("There are no common Student IDs between test_df and train_df.")


There are common Student IDs between test_df and train_df.


In [7]:
for st_id in train_df['Student ID'].values:
    if st_id in reg_df['Student ID'].values:
        continue
    else:
        print('Not in train set')

In [8]:
for st_id in test_df['Student ID'].values:
    if st_id in reg_df['Student ID'].values:
        continue
    else:
        print('Not in test set')

All Student IDs in train set or test set are in reg_df

#### **Inner Join with reg_df**

In [9]:
# If you want to perform a left join, keeping all rows from df1
merged_train = pd.merge(train_df, reg_df, on='Student ID', how='left')

In [10]:
merged_test = pd.merge(test_df, reg_df, on='Student ID', how='left')

In [11]:
merged_train.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Start Date,Program End Date,Program Days,Completed Degree,Level of Education,Education Speaciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,ذكر,منطقة الرياض,الرياض,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,,,حضوري,2023-05-28,2023-06-08,12,نعم,البكالوريوس,هندسة حاسب الالي,,2.44,4.0,غير موظف,,,0,4,0,0,0,0,0,0,0,0,0,0,4
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,ذكر,منطقة عسير,خميس مشيط,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,,متوسط,حضوري,2023-04-02,2023-04-06,5,نعم,البكالوريوس,الإذاعة والتلفزيون والفيلم,الفنون والعلوم الإنسانية,5.0,5.0,طالب,,,0,3,0,6,0,0,1,0,4,0,0,1,15
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,ذكر,منطقة الرياض,الرياض,e006900d-05a9-4c2b-a36f-0ffb9fce44cd,APMR,,,متوسط,حضوري,2023-07-23,2023-09-14,54,نعم,البكالوريوس,Information Technology,,3.5,5.0,موظف,,,0,4,1,7,1,0,0,0,0,0,0,0,13
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,ذكر,منطقة الرياض,الرياض,2ec15f6b-233b-428a-b9f5-e40bc8d14cf9,TOSL,TOSL,,,حضوري,2023-07-23,2023-08-24,33,نعم,البكالوريوس,حوسبة تطبيقية - (مسار شبكات الحاسب),,3.55,5.0,خريج,,,0,4,0,0,2,0,0,0,0,0,0,0,6
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,ذكر,منطقة الرياض,الرياض,d32da0e9-1aed-48c3-992d-a22f9ccc741e,CAUF,SWPS,تقليدية,متوسط,حضوري,2023-04-30,2023-06-22,54,لا,البكالوريوس,نظم المعلومات الحاسوبية,تكنولوجيا الاتصالات والمعلومات,4.0,5.0,,,,0,0,0,4,0,0,0,0,3,0,0,3,10


In [12]:
columns_to_check = ['PCRF', 'GRST', 'CAUF', 'INFA', 'ABIR', 'SERU', 'TOSL', 'APMR', 'DTFH', 'QWLM', 'N/A']

merged_train['Types_courses'] = (merged_train[columns_to_check] > 0).sum(axis=1)

In [13]:
merged_test['Types_courses'] = (merged_test[columns_to_check] > 0).sum(axis=1)

#### **Initial Pre processing**

In [14]:
merged_train['train'] = 1
merged_test['train'] = 0

In [15]:
df_concat = pd.concat([merged_train, merged_test], axis=0)

In [16]:
len(df_concat)

7366

In [17]:
df_concat['Home City'].value_counts()

الرياض             5319
جدة                 404
المدينة المنورة     143
الدمام              128
أبها                117
                   ... 
القريات               1
المجاردة              1
ابها                  1
العرضيات              1
رياض الخبراء          1
Name: Home City, Length: 97, dtype: int64

In [18]:
df_concat['Education Speaciality'] = df_concat['Education Speaciality'].str.lower()

In [19]:
encoder = OrdinalEncoder()

In [20]:
cols_to_encode = ['Gender',  'Home City',
              'Technology Type', 'Program Skill Level', 'Program Presentation Method',
              'Completed Degree',  'College', 
                'Job Type', ]

In [21]:
df_concat[cols_to_encode] = encoder.fit_transform(df_concat[cols_to_encode])

In [22]:
df_concat['Program ID'] = encoder.fit_transform(df_concat[['Program ID']])

In [23]:
df_concat.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Start Date,Program End Date,Program Days,Completed Degree,Level of Education,Education Speaciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration,Types_courses,train
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,منطقة الرياض,24.0,65.0,PCRF,PCRF,,,0.0,2023-05-28,2023-06-08,12,1.0,البكالوريوس,هندسة حاسب الالي,,2.44,4.0,غير موظف,,,0.0,4,0,0,0,0,0,0,0,0,0,0,4,1,1
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,منطقة عسير,66.0,180.0,APMR,SWPS,,2.0,0.0,2023-04-02,2023-04-06,5,1.0,البكالوريوس,الإذاعة والتلفزيون والفيلم,6.0,5.0,5.0,طالب,,,0.0,3,0,6,0,0,1,0,4,0,0,1,15,5,1
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,منطقة الرياض,24.0,197.0,APMR,,,2.0,0.0,2023-07-23,2023-09-14,54,1.0,البكالوريوس,information technology,,3.5,5.0,موظف,,,0.0,4,1,7,1,0,0,0,0,0,0,0,13,4,1
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,منطقة الرياض,24.0,43.0,TOSL,TOSL,,,0.0,2023-07-23,2023-08-24,33,1.0,البكالوريوس,حوسبة تطبيقية - (مسار شبكات الحاسب),,3.55,5.0,خريج,,,0.0,4,0,0,2,0,0,0,0,0,0,0,6,2,1
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,منطقة الرياض,24.0,188.0,CAUF,SWPS,0.0,2.0,0.0,2023-04-30,2023-06-22,54,0.0,البكالوريوس,نظم المعلومات الحاسوبية,8.0,4.0,5.0,,,,0.0,0,0,4,0,0,0,0,3,0,0,3,10,3,1


#### **Features for Program dates**

In [24]:
df_concat['Program Start Date'] = pd.to_datetime(df_concat['Program Start Date'])
df_concat['Program End Date'] = pd.to_datetime(df_concat['Program End Date'])

In [25]:
df_concat["start_day"] = df_concat["Program Start Date"].dt.day
df_concat["start_month"] = df_concat["Program Start Date"].dt.month
df_concat["start_year"] = df_concat["Program Start Date"].dt.year
df_concat["Start_Quarter"] = df_concat["Program Start Date"].dt.quarter
df_concat["Start_weekday"] = df_concat["Program Start Date"].dt.weekday

In [26]:
df_concat["end_day"] = df_concat["Program End Date"].dt.day
df_concat["end_month"] = df_concat["Program End Date"].dt.month
df_concat["end_year"] = df_concat["Program End Date"].dt.year
df_concat["end_Quarter"] = df_concat["Program End Date"].dt.quarter
df_concat["end_weekday"] = df_concat["Program End Date"].dt.weekday

In [27]:
# Dropping start and end date
df_concat.drop(['Program Start Date', 'Program End Date'], axis=1, inplace=True)

In [28]:
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7366 entries, 0 to 817
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Student ID                      7366 non-null   object 
 1   Age                             7260 non-null   float64
 2   Gender                          7366 non-null   float64
 3   Home Region                     7363 non-null   object 
 4   Home City                       7363 non-null   float64
 5   Program ID                      7366 non-null   float64
 6   Program Main Category Code      7366 non-null   object 
 7   Program Sub Category Code       6316 non-null   object 
 8   Technology Type                 4020 non-null   float64
 9   Program Skill Level             5520 non-null   float64
 10  Program Presentation Method     7366 non-null   float64
 11  Program Days                    7366 non-null   int64  
 12  Completed Degree                736

#### **Dropping columns with high frequency of NaNs**

In [29]:
df_concat.drop(['College', 'Job Type', 'Still Working', 'Education Speaciality'], axis=1, inplace=True)

#### **Imputing Employment Status**

In [30]:
df_concat['Employment Status'].value_counts()

موظف           3151
غير موظف       1247
طالب           1090
خريج           1071
موظف - طالب      99
عمل حر           72
Name: Employment Status, dtype: int64

In [31]:
imputer = SimpleImputer(strategy = 'most_frequent')

In [32]:
df_concat['Employment Status'] = imputer.fit_transform(df_concat[['Employment Status']])

In [33]:
df_concat['Employment Status'] = encoder.fit_transform(df_concat[['Employment Status']])

#### **Imputing University Degree Score and University Degree Score System**

In [34]:
df_concat[(df_concat['University Degree Score'].isna()) & (df_concat['University Degree Score System'].notna())]

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,University Degree Score,University Degree Score System,Employment Status,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday


In [35]:
df_concat[(df_concat['University Degree Score'].notna()) & (df_concat['University Degree Score System'].isna())]

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,University Degree Score,University Degree Score System,Employment Status,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday


In [36]:
df_concat['University Degree Score System'].value_counts()

5.0      5297
4.0      1595
100.0     382
Name: University Degree Score System, dtype: int64

In [37]:
df_concat['University Degree Score'].value_counts()

4.000     971
3.000     650
5.000     222
2.000     149
4.500     127
         ... 
99.830      1
89.690      1
1.790       1
3.498       1
3.506       1
Name: University Degree Score, Length: 446, dtype: int64

The most common University degree score system is the 5.0. 

In [38]:
df_concat['University Degree Score'][df_concat['University Degree Score System'] ==5].value_counts()

4.000    826
3.000    384
5.000    222
4.500    127
3.500     65
        ... 
2.150      1
2.880      1
2.690      1
2.710      1
4.061      1
Name: University Degree Score, Length: 279, dtype: int64

And the most common score for the 5.0 score system is 4. We shall impute the missing score system with 5.0 and the missing scores with 4.0

In [39]:
df_concat['University Degree Score System'].fillna(5.0, inplace=True)
df_concat['University Degree Score'].fillna(4.0, inplace=True)

In [40]:
df_concat['Normalized Uni Score'] = df_concat['University Degree Score'] / df_concat['University Degree Score System']
df_concat.drop(['University Degree Score' , 'University Degree Score System'], axis=1, inplace=True)

#### **Imputing Age**

We will impute the age of the missing values according to the median age of the gender.

In [41]:
df_concat[df_concat.Gender == 0]['Age'].median()

24.0

In [42]:
df_concat[df_concat.Gender == 1]['Age'].median()

26.0

In [43]:
df_concat.loc[(df_concat['Age'].isna()) &  
              (df_concat['Gender'] == 0) , 'Age'] = 24

In [44]:
df_concat.loc[(df_concat['Age'].isna()) &  
              (df_concat['Gender'] == 1) , 'Age'] = 26

#### **Imputing Level of Education**

In [45]:
df_concat['Level of Education'].value_counts()

البكالوريوس    6089
الماجستير       554
الدبلوم         354
ثانوي           304
الدكتوراه        35
متوسط             1
Name: Level of Education, dtype: int64

In [46]:
df_concat[['Completed Degree', 'Level of Education']].value_counts()

Completed Degree  Level of Education
1.0               البكالوريوس           4873
0.0               البكالوريوس           1216
1.0               الماجستير              447
                  الدبلوم                263
                  ثانوي                  211
0.0               الماجستير              107
                  ثانوي                   93
                  الدبلوم                 91
1.0               الدكتوراه               22
0.0               الدكتوراه               13
                  متوسط                    1
dtype: int64

We will impute the Level of Education with the most frequent value i.e. 0

In [47]:
imputer = SimpleImputer(strategy = 'most_frequent')

In [48]:
df_concat['Level of Education'] = imputer.fit_transform(df_concat[['Level of Education']])

In [49]:
df_concat['Level of Education'] = encoder.fit_transform(df_concat[['Level of Education']])

#### **Imputing Home Region**

In [50]:
imputer = SimpleImputer(strategy = 'most_frequent')

In [51]:
df_concat['Home Region'] = imputer.fit_transform(df_concat[['Home Region']])

In [52]:
df_concat['Home Region'] = encoder.fit_transform(df_concat[['Home Region']])

#### **Imputing Home City**

In [53]:
df_concat['Home City'].value_counts()

24.0    5319
61.0     404
40.0     143
21.0     128
0.0      117
        ... 
72.0       1
47.0       1
34.0       1
59.0       1
18.0       1
Name: Home City, Length: 97, dtype: int64

In [54]:
df_concat['Home City'] = df_concat['Home City'].fillna(24.0)

#### **Imputing Program Sub Category Code**

In [55]:
pmcc = df_concat.groupby('Program Main Category Code')

In [56]:
pmcc.get_group('APMR')['Program Sub Category Code'].value_counts()

SRTA    797
KLTM    132
SWPS    101
QTDY     57
ASCW     39
Name: Program Sub Category Code, dtype: int64

Most popular sub category for APMR program group is SRTA. We will impute the missing values for this sub code accordingly. 

In [57]:
pmcc.get_group('CAUF')['Program Sub Category Code'].value_counts()

SWPS    1904
CRDP     266
ERST     156
Name: Program Sub Category Code, dtype: int64

Most popular sub category for CAUF program group is SWPS. We will impute the missing values for this sub code accordingly. 

In [58]:
pmcc.get_group('TOSL')['Program Sub Category Code'].value_counts()

TOSL    327
Name: Program Sub Category Code, dtype: int64

Most popular sub category for TOSL program group is TOSL. We will impute the missing values for this sub code accordingly. 

In [59]:
pmcc.get_group('QWLM')['Program Sub Category Code'].value_counts()

Series([], Name: Program Sub Category Code, dtype: int64)

We have no indication of the program sub category code for QWLM program main code. We will impute missing values with most frequent value, i.e SWPS. 

In [60]:
pmcc.get_group('ABIR')['Program Sub Category Code'].value_counts()

INFA    121
ABIR     19
Name: Program Sub Category Code, dtype: int64

Most popular sub category for ABIR program group is INFA. We will impute the missing values for this sub code accordingly. 

In [61]:
pmcc.get_group('DTFH')['Program Sub Category Code'].value_counts()

Series([], Name: Program Sub Category Code, dtype: int64)

We have no indication of the program sub category code for DFTH program main code. We will impute missing values with most frequent value, i.e SWPS. 

In [62]:
# Fill NaN values in column Program Sub Category Code with 'SRTA' where column Program Main Category Code is equal to 'APMR'
df_concat.loc[df_concat['Program Main Category Code'] == 'APMR', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'APMR', 'Program Sub Category Code'].fillna('SRTA')

In [63]:
# Fill NaN values in column Program Sub Category Code with 'SWPS' where column Program Main Category Code is equal to 'CAUF'
df_concat.loc[df_concat['Program Main Category Code'] == 'CAUF', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'CAUF', 'Program Sub Category Code'].fillna('SWPS')

In [64]:
# Fill NaN values in column Program Sub Category Code with 'TOSL' where column Program Main Category Code is equal to 'TOSL'
df_concat.loc[df_concat['Program Main Category Code'] == 'TOSL', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'TOSL', 'Program Sub Category Code'].fillna('TOSL')

In [65]:
# Fill NaN values in column Program Sub Category Code with 'SWPS' where column Program Main Category Code is equal to 'QWLM'
df_concat.loc[df_concat['Program Main Category Code'] == 'QWLM', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'QWLM', 'Program Sub Category Code'].fillna('SWPS')

In [66]:
# Fill NaN values in column Program Sub Category Code with 'INFA' where column Program Main Category Code is equal to 'ABIR'
df_concat.loc[df_concat['Program Main Category Code'] == 'ABIR', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'ABIR', 'Program Sub Category Code'].fillna('INFA')

In [67]:
# Fill NaN values in column Program Sub Category Code with 'SWPS' where column Program Main Category Code is equal to 'DTFH'
df_concat.loc[df_concat['Program Main Category Code'] == 'DTFH', 'Program Sub Category Code'] = df_concat.loc[df_concat['Program Main Category Code'] == 'DTFH', 'Program Sub Category Code'].fillna('SWPS')

#### **Imputing Technology Type**

In [68]:
df_concat['Technology Type'].value_counts()

0.0    3048
2.0     489
1.0     483
Name: Technology Type, dtype: int64

In [69]:
pmcc_pscc = df_concat.groupby(['Program Main Category Code', 'Program Sub Category Code'])

In [70]:
pmcc_pscc['Technology Type'].value_counts()

Program Main Category Code  Program Sub Category Code  Technology Type
ABIR                        ABIR                       2.0                 19
                            INFA                       0.0                 88
                                                       2.0                 67
APMR                        ASCW                       0.0                 39
                            KLTM                       2.0                103
                            QTDY                       0.0                 57
                            SRTA                       0.0                432
                                                       2.0                 80
                                                       1.0                 23
CAUF                        CRDP                       1.0                139
                                                       2.0                 13
                            ERST                       0.0             

We will impute the technology type according to the program main category code most frequent tech type

In [71]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'ABIR') &  
              (df_concat['Program Sub Category Code'] == 'INFA'), 'Technology Type'] = 0.0

In [72]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'APMR') &  
              (df_concat['Program Sub Category Code'] == 'KLTM'), 'Technology Type'] = 2.0

In [73]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'APMR') &  
              (df_concat['Program Sub Category Code'] == 'SRTA'), 'Technology Type'] = 0.0

In [74]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'CAUF') &  
              (df_concat['Program Sub Category Code'] == 'CRDP'), 'Technology Type'] = 1.0

In [75]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'CAUF') &  
              (df_concat['Program Sub Category Code'] == 'SWPS'), 'Technology Type'] = 0.0

In [76]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'APMR') &  
              (df_concat['Program Sub Category Code'] == 'SWPS'), 'Technology Type'] = 0.0

In [77]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'GRST') &  
              (df_concat['Program Sub Category Code'] == 'INFA'), 'Technology Type'] = 0.0

In [78]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'INFA') &  
              (df_concat['Program Sub Category Code'] == 'INFA'), 'Technology Type'] = 0.0

In [79]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'PCRF') &  
              (df_concat['Program Sub Category Code'] == 'PCRF'), 'Technology Type'] = 0.0

In [80]:
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'TOSL') &  
              (df_concat['Program Sub Category Code'] == 'TOSL'), 'Technology Type'] = 0.0

In [81]:
# QWLM - SWPS has no technology type - will impute with most frequent overall
df_concat.loc[(df_concat['Technology Type'].isna()) &  
              (df_concat['Program Main Category Code'] == 'QWLM') &  
              (df_concat['Program Sub Category Code'] == 'SWPS'), 'Technology Type'] = 0.0

#### **Imputing Program Skill Level**

In [82]:
df_concat['Program Skill Level'].value_counts()

2.0    2416
0.0    2320
1.0     784
Name: Program Skill Level, dtype: int64

As all program skill levels who are NaN are not represented by any Program ID whose skill level we can match, we will use the category codes to infer skill level

In [83]:
df_concat[df_concat['Program Skill Level'].isna()]['Program ID'].value_counts()

109.0    53
154.0    52
93.0     51
43.0     48
132.0    47
         ..
217.0    16
131.0    16
155.0    15
16.0     11
200.0    10
Name: Program ID, Length: 73, dtype: int64

In [84]:
prog_skill = df_concat.groupby(['Program Main Category Code','Program Sub Category Code'])['Program Skill Level']

In [85]:
def get_skill(row):
    pmcc = row['Program Main Category Code']
    pscc = row['Program Sub Category Code']
    imputed_skill = prog_skill.get_group((pmcc, pscc)).dropna().values[0]
    return imputed_skill                                  

In [86]:
# Applying function to our dataset
df_concat['Program Skill Level'] = df_concat.apply(get_skill, axis=1)

#### **Creating feature based on applications made for same code**

In [87]:
df_concat.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,PCRF,PCRF,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,0,0,0,0,0,0,0,0,0,0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,APMR,SWPS,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,3,0,6,0,0,1,0,4,0,0,1,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,APMR,SRTA,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,4,1,7,1,0,0,0,0,0,0,0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,TOSL,TOSL,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,4,0,0,2,0,0,0,0,0,0,0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,CAUF,SWPS,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,0,0,4,0,0,0,0,3,0,0,3,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8


In [88]:
df_concat['already_applied'] = np.nan

In [89]:
def apply_values(df):
    # Extract the column names from 'Program Main Category Code' column
    categories = df['Program Main Category Code'].unique()

    # Iterate over each category
    for category in categories:
        # Get the column name based on the category
        column_name = df[category]

        # Update 'already_applied' column where 'Program Main Category Code' matches the category
        df.loc[df['Program Main Category Code'] == category, 'already_applied'] = column_name
        
    return df


In [90]:
df_concat.reset_index(drop=True, inplace=True)

In [91]:
df_concat = apply_values(df_concat)

In [92]:
df_concat.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score,already_applied
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,PCRF,PCRF,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,0,0,0,0,0,0,0,0,0,0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61,4.0
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,APMR,SWPS,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,3,0,6,0,0,1,0,4,0,0,1,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0,4.0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,APMR,SRTA,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,4,1,7,1,0,0,0,0,0,0,0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7,0.0
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,TOSL,TOSL,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,4,0,0,2,0,0,0,0,0,0,0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71,0.0
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,CAUF,SWPS,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,0,0,4,0,0,0,0,3,0,0,3,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8,4.0


In [93]:
df_concat.drop(['PCRF', 'GRST', 'CAUF', 'INFA', 'ABIR', 'SERU', 'TOSL', 'APMR', 'DTFH', 'QWLM', 'N/A'], axis=1, inplace=True)

#### **Combining Program codes**

In [94]:
df_concat.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score,already_applied
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,PCRF,PCRF,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61,4.0
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,APMR,SWPS,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0,4.0
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,APMR,SRTA,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7,0.0
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,TOSL,TOSL,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71,0.0
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,CAUF,SWPS,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8,4.0


In [95]:
df_concat['Program_code'] = df_concat['Program Main Category Code'] + df_concat['Program Sub Category Code']

In [96]:
df_concat.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score,already_applied,Program_code
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,PCRF,PCRF,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61,4.0,PCRFPCRF
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,APMR,SWPS,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0,4.0,APMRSWPS
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,APMR,SRTA,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7,0.0,APMRSRTA
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,TOSL,TOSL,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71,0.0,TOSLTOSL
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,CAUF,SWPS,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8,4.0,CAUFSWPS


In [97]:
# Dropping Program Main Category Code and Program Sub Category Code
df_concat.drop(['Program Main Category Code', 'Program Sub Category Code'], axis=1, inplace=True)

#### **Re-setting train and test sets**

In [98]:
new_train = df_concat[df_concat.train == 1]
new_test = df_concat[df_concat.train == 0]

In [99]:
new_train.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score,already_applied,Program_code
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61,4.0,PCRFPCRF
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0,4.0,APMRSWPS
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7,0.0,APMRSRTA
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71,0.0,TOSLTOSL
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8,4.0,CAUFSWPS


In [100]:
code_list = new_train['Program_code'].value_counts().index.tolist()

In [101]:
program_codes_list = set(code_list)

In [102]:
program_codes_dict = {}
for i,code in enumerate(program_codes_list):
    program_codes_dict[code] = i

In [103]:
new_train['Program_code'] = new_train['Program_code'].map(program_codes_dict)
new_test['Program_code'] = new_test['Program_code'].map(program_codes_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['Program_code'] = new_train['Program_code'].map(program_codes_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test['Program_code'] = new_test['Program_code'].map(program_codes_dict)


In [104]:
new_train.head()

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Technology Type,Program Skill Level,Program Presentation Method,Program Days,Completed Degree,Level of Education,Employment Status,Y,Total Regestration,Types_courses,train,start_day,start_month,start_year,Start_Quarter,Start_weekday,end_day,end_month,end_year,end_Quarter,end_weekday,Normalized Uni Score,already_applied,Program_code
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,1.0,4.0,24.0,65.0,0.0,0.0,0.0,12,1.0,0.0,3.0,0.0,4,1,1,28,5,2023,2,6,8,6,2023,2,3,0.61,4.0,2
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,1.0,10.0,66.0,180.0,0.0,2.0,0.0,5,1.0,0.0,1.0,0.0,15,5,1,2,4,2023,2,6,6,4,2023,2,3,1.0,4.0,5
2,38a11c0e-4afc-4261-9c64-e94cc0a272fb,24.0,1.0,4.0,24.0,197.0,0.0,2.0,0.0,54,1.0,0.0,4.0,0.0,13,4,1,23,7,2023,3,6,14,9,2023,3,3,0.7,0.0,6
3,1693e85b-f80e-40ce-846f-395ddcece6d3,23.0,1.0,4.0,24.0,43.0,0.0,1.0,0.0,33,1.0,0.0,0.0,0.0,6,2,1,23,7,2023,3,6,24,8,2023,3,3,0.71,0.0,10
4,98a0e8d0-5f80-4634-afd8-322aa0902863,23.0,1.0,4.0,24.0,188.0,0.0,2.0,0.0,54,0.0,0.0,4.0,0.0,10,3,1,30,4,2023,2,6,22,6,2023,2,3,0.8,4.0,14


#### **Setting training and test sets for ML**

In [105]:
X = new_train.drop(['Student ID', 'train', 'Y' ], axis=1)
y = new_train['Y'].values

X_test = new_test.drop(['Student ID', 'train', 'Y',], axis=1)

print(f'X Size = {X.shape}')
print(f'Test Size = {X_test.shape}')

X Size = (6548, 27)
Test Size = (818, 27)


In [106]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [107]:
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import f1_score

In [108]:
lgbm_params = {'num_leaves': 26, 
               'learning_rate': 0.04758279648208103, 
               'n_estimators': 452, 
               'lambda_l1': 0.013975208382661275, 
               'lambda_l2': 0.0465490931600607, 
               'max_depth': 14, 
               'colsample_bytree': 0.6587810846609027, 
               'subsample': 0.8038378821281886, 
               'min_child_samples': 11,
               "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "random_state": 554,
            'class_weight': 'balanced',
              }

In [109]:
gbc_params = {'n_estimators': 768,
 'learning_rate': 0.027749587812978364,
 'max_depth': 3,
 'min_samples_split': 19,
 'min_samples_leaf': 9,
              'random_state':554, 
              'verbose':0,
 'max_features': 'auto'}
 

In [110]:
cat_params = {'learning_rate': 0.049896793120466194,
              'iterations':1260,
              #'custom_metric':'F1',
              'depth':4,
              'l2_leaf_reg':3.745156035695952,
              'random_seed':554,
              'verbose':250,
              'thread_count': -1,
              'auto_class_weights':'Balanced',
              'border_count': 190,
            'eval_metric': 'F1'
             }



In [111]:
# Count the number of occurrences of 1s and 0s
num_ones = np.count_nonzero(y == 1)
num_zeros = np.count_nonzero(y == 0)

print("Number of 1s:", num_ones)
print("Number of 0s:", num_zeros)

Number of 1s: 1039
Number of 0s: 5509


In [112]:
ratio = num_zeros/num_ones

In [113]:
best_params_XGB = {'min_child_weight': 0,
 'learning_rate': 0.03142786116761427,
 'scale_pos_weight': ratio,
 'n_estimators': 1387,
 'max_depth': 5,
    'objective': 'binary:logistic',
 'subsample': 0.9246147944458577,
 'colsample_bytree': 0.3231773832609742,
 'random_state': 42,
 'gamma': 0.7024863569345223,
 'reg_alpha': 0.9856926371248441,
 'reg_lambda': 0.9926502539941398}

In [114]:
rf_params = {'n_estimators' : 300,
             'min_samples_split': 3, 
             'min_samples_leaf':2,
             'random_state':554,
             'class_weight':'balanced',
             'n_jobs': -1
            }

In [115]:
xgb = XGBClassifier(**best_params_XGB)
cat = CatBoostClassifier(**cat_params)
gbc = GradientBoostingClassifier(**gbc_params)
lgbm = LGBMClassifier(**lgbm_params)
rf = RandomForestClassifier(**rf_params)

In [116]:
def train_evaluate_model(X, y, X_test, model, splits: int):
    
       
    # use RepeatedStratified KFold
    skf = RepeatedStratifiedKFold(n_splits=splits, n_repeats=3, random_state=554)

    scores = 0
    predictions = []
    feature_importances = []
    best_thresholds = []

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        
        #print('Fitting with k-fold', i+1, 'out of', splits)
        X_train = np.array(X)[train_index, :]
        y_train = y[train_index]
        
        X_val = np.array(X)[val_index, :]
        y_val = y[val_index]
        
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_val)[:,1]
        
       
        best_thresh = 0
        best_score = 0
        
        for thresh in np.arange(0.5,1.0,0.01):
            y_pred_val = y_pred_proba > thresh
            temp_score = f1_score(y_val, y_pred_val)
            
            if temp_score > best_score:
                best_score = temp_score
                best_thresh = thresh
        
        best_y_pred_val = y_pred_proba > best_thresh
        best_thresholds.append(best_thresh)
        
        fold_f1 = f1_score(y_val, best_y_pred_val)
        print(f'Fold F1 Score : {fold_f1} with threshold of {best_thresh}')
        scores += fold_f1/(splits*3)
            
    print(f'Overall F1 Score : {scores}')
    print(f'Overall average threshold : {np.mean(best_thresholds)}')


Here you can test each individual classifier accordingly. 

In [117]:
train_evaluate_model(X, y, X_test, xgb, 5)

Fold F1 Score : 0.6484018264840182 with threshold of 0.53
Fold F1 Score : 0.672811059907834 with threshold of 0.51
Fold F1 Score : 0.6448598130841121 with threshold of 0.5800000000000001
Fold F1 Score : 0.6480186480186481 with threshold of 0.5
Fold F1 Score : 0.6227272727272727 with threshold of 0.53
Fold F1 Score : 0.65625 with threshold of 0.5
Fold F1 Score : 0.6636363636363637 with threshold of 0.5
Fold F1 Score : 0.6276346604215457 with threshold of 0.5
Fold F1 Score : 0.6303317535545023 with threshold of 0.6200000000000001
Fold F1 Score : 0.6344827586206896 with threshold of 0.5
Fold F1 Score : 0.6547314578005116 with threshold of 0.6400000000000001
Fold F1 Score : 0.6650831353919241 with threshold of 0.6000000000000001
Fold F1 Score : 0.636144578313253 with threshold of 0.5700000000000001
Fold F1 Score : 0.6363636363636364 with threshold of 0.55
Fold F1 Score : 0.6130434782608696 with threshold of 0.51
Overall F1 Score : 0.6436346961723455
Overall average threshold : 0.5426666666

#### Catboost optuna

In [None]:
def objective(trial):
        
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 100),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'thread_count': -1,  # Use all available threads
        'eval_metric': 'F1',  # Use F1 for evaluation
            'auto_class_weights':'Balanced',
            'random_seed':554,
        'verbose': False  # Suppress CatBoost output
    }

    cat = CatBoostClassifier(**params)
    cat_score = train_evaluate_model(X, y, X_test, cat, 5)
    
    return cat_score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600)

In [None]:
study.best_params

#### Gradient Boosting Classifier Optuna

In [None]:
def objective_gbc(trial):
    
        # Define parameters to be tuned
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        'random_state': 554,
        'verbose': 0
    }
    
    gbc = GradientBoostingClassifier(**params)
    gbc_score = train_evaluate_model(X, y, X_test, gbc, 5)
    
    return gbc_score

In [None]:
study_gbc = optuna.create_study(direction='maximize')
study_gbc.optimize(objective_gbc, timeout=3600)

In [None]:
study_gbc.best_params

#### LGBM Optuna

In [None]:
def objective_lgbm(trial):

    params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 554,
        'class_weight': 'balanced',
        'num_leaves': trial.suggest_int("num_leaves", 25, 40),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.09),
        "n_estimators": trial.suggest_int("n_estimators", 400, 600),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
    }
        
        
    lgbm = LGBMClassifier(**params)
    lgbm_score = train_evaluate_model(X, y, X_test, lgbm, 5)
    
    return lgbm_score

In [None]:
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, timeout=3600)

#### Voting Classifier Final Predictions

In [118]:
def train_evaluate_model(X, y, X_test, clf1, clf2, clf3, clf4, splits: int):
    
       
    # use Stratified KFold
    skf = RepeatedStratifiedKFold(n_splits=splits, n_repeats=2, random_state=554)

    scores = 0
    predictions = []
    feature_importances = []

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        
        print('Fitting with k-fold', i+1, 'out of', splits*2)
        X_train = np.array(X)[train_index, :]
        y_train = y[train_index]
        
        X_val = np.array(X)[val_index, :]
        y_val = y[val_index]
        
        model = VotingClassifier(estimators=[
         ('cat', clf1), ('gbc', clf2), ('lgbm', clf3), ('xgb', clf4)],
         voting='soft')

        model.fit(X_train, y_train) 

        y_pred_val = model.predict(X_val)
        
        fold_f1 = f1_score(y_val, y_pred_val)
        print(f'Fold F1 Score : {fold_f1}')
        scores += fold_f1/(splits*2)
        
        # store predictions
        y_pred_test_proba = model.predict_proba(X_test)
        predictions.append(y_pred_test_proba)
        print('Round', i+1, 'Predictions stored successfully')
        
            
    print(f'Overall F1 Score : {scores}')
    
    return predictions, feature_importances
        
        

In [119]:
vot_predictions, vot_feat_imp = train_evaluate_model(X, y, X_test, cat, gbc, lgbm, xgb, 5)

Fitting with k-fold 1 out of 10
0:	learn: 0.7536609	total: 83.7ms	remaining: 1m 45s
250:	learn: 0.8327454	total: 321ms	remaining: 1.29s
500:	learn: 0.8871793	total: 559ms	remaining: 847ms
750:	learn: 0.9128551	total: 792ms	remaining: 537ms
1000:	learn: 0.9394051	total: 1.06s	remaining: 274ms
1250:	learn: 0.9508989	total: 1.3s	remaining: 9.34ms
1259:	learn: 0.9515182	total: 1.31s	remaining: 0us
Fold F1 Score : 0.6510538641686182
Round 1 Predictions stored successfully
Fitting with k-fold 2 out of 10
0:	learn: 0.7496221	total: 1.19ms	remaining: 1.5s




250:	learn: 0.8327882	total: 240ms	remaining: 966ms
500:	learn: 0.8860625	total: 480ms	remaining: 727ms
750:	learn: 0.9167859	total: 716ms	remaining: 486ms
1000:	learn: 0.9363660	total: 955ms	remaining: 247ms
1250:	learn: 0.9482153	total: 1.19s	remaining: 8.58ms
1259:	learn: 0.9482153	total: 1.2s	remaining: 0us
Fold F1 Score : 0.6857142857142857
Round 2 Predictions stored successfully
Fitting with k-fold 3 out of 10
0:	learn: 0.7561939	total: 936us	remaining: 1.18s




250:	learn: 0.8391167	total: 241ms	remaining: 969ms
500:	learn: 0.8911609	total: 481ms	remaining: 729ms
750:	learn: 0.9117544	total: 725ms	remaining: 492ms
1000:	learn: 0.9360463	total: 961ms	remaining: 249ms
1250:	learn: 0.9517300	total: 1.2s	remaining: 8.63ms
1259:	learn: 0.9519420	total: 1.21s	remaining: 0us
Fold F1 Score : 0.6436781609195402
Round 3 Predictions stored successfully
Fitting with k-fold 4 out of 10
0:	learn: 0.7549336	total: 1.27ms	remaining: 1.6s




250:	learn: 0.8385653	total: 238ms	remaining: 957ms
500:	learn: 0.8903432	total: 478ms	remaining: 724ms
750:	learn: 0.9168859	total: 719ms	remaining: 487ms
1000:	learn: 0.9374442	total: 958ms	remaining: 248ms
1250:	learn: 0.9516952	total: 1.2s	remaining: 8.61ms
1259:	learn: 0.9521208	total: 1.21s	remaining: 0us
Fold F1 Score : 0.6447058823529412
Round 4 Predictions stored successfully
Fitting with k-fold 5 out of 10
0:	learn: 0.7488321	total: 1.22ms	remaining: 1.53s




250:	learn: 0.8337197	total: 246ms	remaining: 988ms
500:	learn: 0.8882072	total: 485ms	remaining: 735ms
750:	learn: 0.9204709	total: 719ms	remaining: 487ms
1000:	learn: 0.9398395	total: 956ms	remaining: 247ms
1250:	learn: 0.9519279	total: 1.2s	remaining: 8.61ms
1259:	learn: 0.9521412	total: 1.21s	remaining: 0us
Fold F1 Score : 0.6195899772209568
Round 5 Predictions stored successfully
Fitting with k-fold 6 out of 10
0:	learn: 0.7529186	total: 1.35ms	remaining: 1.7s




250:	learn: 0.8400411	total: 239ms	remaining: 961ms
500:	learn: 0.8848048	total: 474ms	remaining: 719ms
750:	learn: 0.9169106	total: 707ms	remaining: 479ms
1000:	learn: 0.9365663	total: 938ms	remaining: 243ms
1250:	learn: 0.9493963	total: 1.18s	remaining: 8.48ms
1259:	learn: 0.9507495	total: 1.19s	remaining: 0us
Fold F1 Score : 0.6514806378132119
Round 6 Predictions stored successfully
Fitting with k-fold 7 out of 10
0:	learn: 0.7591427	total: 1.06ms	remaining: 1.33s




250:	learn: 0.8297541	total: 236ms	remaining: 948ms
500:	learn: 0.8887335	total: 478ms	remaining: 724ms
750:	learn: 0.9202429	total: 712ms	remaining: 483ms
1000:	learn: 0.9345810	total: 949ms	remaining: 245ms
1250:	learn: 0.9445522	total: 1.19s	remaining: 8.54ms
1259:	learn: 0.9453895	total: 1.2s	remaining: 0us
Fold F1 Score : 0.6619385342789598
Round 7 Predictions stored successfully
Fitting with k-fold 8 out of 10
0:	learn: 0.7480467	total: 1.08ms	remaining: 1.36s




250:	learn: 0.8396466	total: 240ms	remaining: 967ms
500:	learn: 0.8898149	total: 480ms	remaining: 727ms
750:	learn: 0.9165237	total: 712ms	remaining: 483ms
1000:	learn: 0.9339376	total: 947ms	remaining: 245ms
1250:	learn: 0.9513227	total: 1.18s	remaining: 8.52ms
1259:	learn: 0.9520480	total: 1.19s	remaining: 0us
Fold F1 Score : 0.6298076923076923
Round 8 Predictions stored successfully
Fitting with k-fold 9 out of 10
0:	learn: 0.7496442	total: 1.49ms	remaining: 1.88s




250:	learn: 0.8371068	total: 249ms	remaining: 1s
500:	learn: 0.8915442	total: 487ms	remaining: 737ms
750:	learn: 0.9129825	total: 720ms	remaining: 488ms
1000:	learn: 0.9373376	total: 954ms	remaining: 247ms
1250:	learn: 0.9513214	total: 1.19s	remaining: 8.59ms
1259:	learn: 0.9524774	total: 1.2s	remaining: 0us
Fold F1 Score : 0.6292134831460674
Round 9 Predictions stored successfully
Fitting with k-fold 10 out of 10
0:	learn: 0.7534812	total: 1.3ms	remaining: 1.64s




250:	learn: 0.8367869	total: 246ms	remaining: 988ms
500:	learn: 0.8846497	total: 484ms	remaining: 734ms
750:	learn: 0.9133743	total: 719ms	remaining: 487ms
1000:	learn: 0.9390999	total: 953ms	remaining: 247ms
1250:	learn: 0.9507099	total: 1.19s	remaining: 8.58ms
1259:	learn: 0.9516479	total: 1.2s	remaining: 0us
Fold F1 Score : 0.631578947368421
Round 10 Predictions stored successfully
Overall F1 Score : 0.6448761465290694




#### Voting classifier

In [120]:
vot_preds = np.mean(vot_predictions, axis=0)

In [121]:
vot_preds

array([[0.1071144 , 0.8928856 ],
       [0.79125867, 0.20874133],
       [0.70543044, 0.29456956],
       ...,
       [0.96969515, 0.03030485],
       [0.01747665, 0.98252335],
       [0.91435489, 0.08564511]])

In [122]:
# Here you can choose an experiment with the probability threshold

final_preds = vot_preds[:,1] > 0.54

In [123]:
sub_df = pd.DataFrame(columns=['ID','Y'])

In [124]:
sub_df['ID'] = list(range(1,819))
sub_df['Y'] = final_preds.astype(int)

In [125]:
sub_df.head()

Unnamed: 0,ID,Y
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0


In [None]:
sub_df.to_csv('submission.csv', index=False)

In [None]:
len(sub_df)