In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e1/sample_submission.csv
/kaggle/input/playground-series-s6e1/train.csv
/kaggle/input/playground-series-s6e1/test.csv


In [22]:
from sklearn.model_selection import train_test_split

#Accessing the training and and testing datasets
X_data =pd.read_csv('/kaggle/input/playground-series-s6e1/train.csv')
X_data_test = pd.read_csv('/kaggle/input/playground-series-s6e1/test.csv')

#selecting the target for the predictions
y = X_data['exam_score'].copy()

#Selecting the prospective predictors for exam scores
features = ['age', 'gender', 'course','study_hours','class_attendance','internet_access','sleep_hours','sleep_quality','study_method','facility_rating']
X = X_data[features].copy()
X_test = X_data_test[features].copy()

#Splitting the data into the their respective training and validation sets. 
X_train, X_valid, y_train, y_valid = train_test_split(X,y, train_size =0.8, test_size=0.2, random_state= 0)



In [57]:
#Selecting desired numerical and categorical columns
categorical_cols2 = [colname for colname in X_train.columns if 
                   X_train[colname].nunique()<8 and
                   X_train[colname].dtype=='object']
numerical_cols2 = [colname for colname in X_train.columns if 
                  X_train[colname].dtype in ['int64', 'float64']]
full_cols = categorical_cols2 + numerical_cols2
X_train_full = X_train[full_cols].copy()
X_valid_full= X_valid[full_cols].copy()
X_test_full = X_test[full_cols].copy()

In [58]:
#Importing the OneHotEncoder and the SimpleImputer for imputing and encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


In [59]:
#excluding outliers
y_other = y_train  

q1, q3 = np.percentile(y_other, [25, 75])
iqr = q3 - q1

lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

outlier_mask = (y_other < lower) | (y_other> upper)
x_train_clean =X_train_full.loc[~outlier_mask]
y_train_clean =y_other.loc[~outlier_mask]

In [67]:
#OneHotEncoding
#Encoder

#selecting potential data leaks and removing the features that could cause data leakage
potential_leaks= ['exam_difficulty', 'age', 'course', 'internet_activity','gender']
X_train_drop = x_train_clean.drop(columns = [i for i in potential_leaks2 if i in x_train_clean.columns])
X_valid_drop = X_valid_full.drop(columns = [i for i in potential_leaks2 if i in X_valid_full.columns])
X_test_drop =X_test_full.drop(columns =[i for i in potential_leaks2 if i in X_test_full.columns])
s = (X_train_drop.dtypes == 'object')
object_cols2 = list(s[s].index)
t = (X_valid_drop.dtypes == 'object')
object_cols3 = list(t[t].index)

#OneHotEcoder on the data without data leaks
OH = OneHotEncoder(handle_unknown= "ignore", sparse_output = False)
OH_X_train_cols = pd.DataFrame(OH.fit_transform(X_train_drop[object_cols2]))
OH_X_valid_cols = pd.DataFrame(OH.transform(X_valid_drop[object_cols3]))

OH_X_train_cols.index = X_train_drop.index
OH_X_valid_cols.index = X_valid_drop.index

drop_Xt = X_train_drop.drop(object_cols2, axis=1)
drop_Xv = X_valid_drop.drop(object_cols3, axis=1)

OH_X_train = pd.concat([drop_Xt, OH_X_train_cols], axis=1)
OH_X_valid = pd.concat([drop_Xv, OH_X_valid_cols], axis =1) 

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns =OH_X_valid.columns.astype(str)

OH_X_test =pd.get_dummies(X_test_drop)
OH_X_test = OH_X_test.reindex(columns =OH_X_train.columns, fill_value =0)


#SimpleImputer on the data 
s_i = SimpleImputer(strategy ='most_frequent')
imputed_X_train = pd.DataFrame(s_i.fit_transform(OH_X_train))
imputed_X_valid = pd.DataFrame(s_i.transform(OH_X_valid))
imputed_X_train.columns= OH_X_train.columns
imputed_X_valid.columns= OH_X_valid.columns
imputed_X_test =pd.DataFrame(s_i.transform(OH_X_test), columns =OH_X_train.columns)


In [79]:
#Importing XGBoost and root_mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error
 
#Combining training and validation for full dataset
X_final = np.vstack([imputed_X_train,imputed_X_valid])
y_final = np.hstack([y_train_clean,y_valid])

#function computed to print the rmse score
def score_data(X_train,y_train,X_valid , y_valid):
    model = XGBRegressor(n_estimators=860, 
                          max_depth=8, 
                          learning_rate=0.032, 
                          booster ='gbtree', 
                          objective ="reg:squarederror", 
                          eval_metric ='rmse', 
                          subsample=0.86,
                          colsample_bytree=0.8,
                          reg_lambda=5,
                          reg_alpha=0.4,
                          random_state=41,
)
    model.fit(X_train, 
              y_train, 
              eval_set= [(X_valid, y_valid)], 
              verbose= False)
    preds = model.predict(X_valid)
    return root_mean_squared_error(y_valid, preds)    
    
print(score_data(imputed_X_train,y_train_clean,imputed_X_valid , y_valid))

#The final XGBoost model for the full dataset
model_full = XGBRegressor(n_estimators=860, 
                          max_depth=8, 
                          learning_rate=0.032, 
                          booster ='gbtree', 
                          objective ="reg:squarederror", 
                          eval_metric ='rmse', 
                          subsample=0.86,
                          colsample_bytree=0.8,
                          reg_lambda=5,
                          reg_alpha=0.4,
                          random_state=41,
)
model_full.fit(X_final, y_final, verbose= False)
preds2 = model_full.predict(imputed_X_test)
#printing the first 5 results/prediction
print(preds2[:5])

8.750436901450229
[72.68659  70.907814 84.70799  53.743515 43.741096]


In [78]:
#Importing the libraries for pipelining and cross-validation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer, make_column_selector as selector

#selecting potential leaks
potential_leaks= [ 'exam_difficulty', 'age', 'course', 'internet_activity','gender']

#dropping potential leaks
X_train_drop = X_train_full.drop(columns = [i for i in potential_leaks if i in X_train_full.columns])

numeric_cols= X_train_drop.select_dtypes(include =["int64", "float64"]).columns
categorical_cols=X_train_drop.select_dtypes(include =["object", "category", "bool"]).columns

#preprocessing of data
preprocess = ColumnTransformer(
    transformers =[
        ("num", SimpleImputer(strategy ="median"), numeric_cols),
        ("cat", Pipeline(steps=[("imputer", SimpleImputer(strategy ="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore"))]), categorical_cols),
    ]
)
#pipeline model 
model_pipe = XGBRegressor(n_estimators=860, 
                          max_depth=8, 
                          learning_rate=0.032, 
                          booster ='gbtree', 
                          objective ="reg:squarederror", 
                          eval_metric ='rmse', 
                          subsample=0.86,
                          colsample_bytree=0.8,
                          reg_lambda=5,
                          reg_alpha=0.4,
                          random_state=49)
pipe = Pipeline(steps=[("prep", preprocess), ("model", model_pipe)])
#computing cross-validation scores
cv_scores = cross_val_score(pipe,
                            X_train_drop, 
                            y_train_clean, 
                            cv=5, 
                            scoring ='neg_root_mean_squared_error', 
                            error_score ='raise')
print("Cross vals: ", -cv_scores.mean())


Cross vals:  8.759446092668204


In [None]:
#### function computed to ascertain optimal hyperparameters
def get_score_opt(n_estimators):
    pipe_line_second = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', XGBRegressor(
                          random_state=41,
                          learning_rate=0.032,
                          n_estimators= n_estimators, 
                          max_depth= 8, 
                          booster ='gbtree', 
                          objective = 'reg:squarederror', 
                          eval_metric ='rmse', 
                          subsample= 0.86,
                          colsample_bytree=0.8,
                          reg_lambda=5,
                          reg_alpha=0.4
                          ))
    ])
    avg_mse = -1* cross_val_score(
        pipe_line_second, 
        imputed_X_train, 
        y_train,
        cv =5, 
        scoring= 'neg_root_mean_squared_error', 
        error_score= 'raise'
    )
    return avg_mse.mean()
#iteration to display the behaviour of the model of the specified function parameter  
range_opt = {}
for i in range(1,10):
    iterate = 10*i +800
    range_opt[iterate] = get_score_opt(iterate)
    print(iterate, range_opt[iterate])

810 8.758324409870072
820 8.75830224441226
830 8.758284716238776
840 8.758309710412915
850 8.75826253201598
860 8.758254227070084
870 8.758328347546989
880 8.758286636476607
890 8.758339392203704


In [80]:


#creating a notebook output/dataset
sample =pd.read_csv('/kaggle/input/playground-series-s6e1/sample_submission.csv')


output = sample.copy()
output['exam_score'] = preds2
output.to_csv('output5.csv', index = False)

print(output.head)

<bound method NDFrame.head of             id  exam_score
0       630000   72.686592
1       630001   70.907814
2       630002   84.707993
3       630003   53.743515
4       630004   43.741096
...        ...         ...
269995  899995   58.845261
269996  899996   37.589554
269997  899997   75.613831
269998  899998   58.394192
269999  899999   70.382912

[270000 rows x 2 columns]>
