In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e1/sample_submission.csv
/kaggle/input/playground-series-s6e1/train.csv
/kaggle/input/playground-series-s6e1/test.csv


In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Read the competition's train and test CSV files in one step.
train_df, test_df = (
    pd.read_csv('/kaggle/input/playground-series-s6e1/train.csv'),
    pd.read_csv('/kaggle/input/playground-series-s6e1/test.csv'),
)

# Last expression of the cell: show the first rows of the training frame.
train_df.head()
# train_df.columns

Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0


In [15]:
# Number of distinct values in each object-dtype column of the training frame.
text_columns = train_df.select_dtypes(include=['object'])
print(text_columns.nunique())

gender             3
course             7
internet_access    2
sleep_quality      3
study_method       5
facility_rating    3
exam_difficulty    3
dtype: int64


In [67]:
# Rows with a missing target are dropped before the target is separated out.
train_df = train_df.dropna(subset=["exam_score"])

# Target series, plus feature frames without the target and the row identifier.
y = train_df["exam_score"]
X_train = train_df.drop(columns=["exam_score", "id"])
X_test = test_df.drop(columns=["id"])

# X_train = X_train.head(30000)
# y = y.head(30000)

In [68]:
# Columns whose string labels are later replaced by integer ranks.
ordinal_features = ["sleep_quality", "facility_rating", "exam_difficulty"]

# Everything routed through the numeric transformer: the raw numeric columns
# followed by the ordinal columns.
numerical_features = [
    "age",
    "study_hours",
    "class_attendance",
    "sleep_hours",
] + ordinal_features

# Every remaining column of X_train is handled as a nominal category.
categorical_features = [
    name for name in X_train.columns if name not in numerical_features
]
# categorical_features = ['gender', 'course', 'internet_access', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [69]:
# Feature engineering and ordinal encoding, applied identically to the train
# and test frames so that both end up with the same columns.
#
# The cell is safe to re-run: engineered feature names are registered in
# `numerical_features` only once, and ordinal columns that are already numeric
# are not mapped a second time (re-mapping integers would turn them into NaN).

# Ordered categories -> integer ranks.
sleep_quality_map = {"poor": 0, "average": 1, "good": 2}
facility_rating_map = {"low": 0, "medium": 1, "high": 2}
# The data labels the middle difficulty "moderate" (see the train_df.head()
# output), not "medium"; with the wrong key Series.map silently produced NaN
# for every such row. The third label is assumed to be "hard" -- the check
# below raises if any label turns out to be unmapped.
exam_difficulty_map = {"easy": 0, "moderate": 1, "hard": 2}

ordinal_maps = {
    "sleep_quality": sleep_quality_map,
    "facility_rating": facility_rating_map,
    "exam_difficulty": exam_difficulty_map,
}
engineered_features = [
    "effective_study_hours",
    "study_hours_ratio",
    "combined_study_sleep",
]

for frame in (X_train, X_test):
    # Study hours scaled by the attendance percentage expressed as a fraction.
    frame["effective_study_hours"] = frame["study_hours"] * (frame["class_attendance"] / 100)

    # Study hours per hour of sleep. A zero in sleep_hours yields +/-inf, which
    # scikit-learn rejects, so treat it as missing and let the imputer fill it.
    frame["study_hours_ratio"] = (
        frame["study_hours"] / frame["sleep_hours"]
    ).replace([np.inf, -np.inf], np.nan)

    # Interaction of the two engineered features above.
    frame["combined_study_sleep"] = frame["effective_study_hours"] * frame["study_hours_ratio"]

    for column, mapping in ordinal_maps.items():
        if pd.api.types.is_numeric_dtype(frame[column]):
            continue  # already encoded by an earlier run of this cell
        encoded = frame[column].map(mapping)
        # A value that was present before mapping but is NaN afterwards is a
        # label missing from the mapping; fail loudly instead of imputing it.
        unmapped = frame.loc[frame[column].notna() & encoded.isna(), column].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Unmapped labels in column '{column}': {sorted(map(str, unmapped))}"
            )
        frame[column] = encoded

# Route the engineered columns through the numeric transformer (once only).
for feature in engineered_features:
    if feature not in numerical_features:
        numerical_features.append(feature)

In [70]:
# Numeric branch: fill missing values with the column mean.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

# Categorical branch: fill missing values with the most frequent label, then
# one-hot encode; categories unseen during fit are ignored at transform time.
category_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Send each column group through its own branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numerical_features),
        ("cat", category_transformer, categorical_features),
    ]
)

# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestRegressor(n_estimators=50, max_depth=8, random_state=1)

# Preprocessing and model chained into a single estimator.
clf = Pipeline([("preprocessor", preprocessor), ("model", model)])

# 3-fold cross-validated R^2 on the training data; report the mean.
scores = cross_val_score(clf, X_train, y, cv=3, scoring="r2")
print("CV R2:", scores.mean())

CV R2: 0.7556990428642353


In [71]:
# Fit the full pipeline on all training rows, then predict the test set.
clf.fit(X_train, y)
y_pred = clf.predict(X_test)

# Submission frame: the test ids alongside the predicted scores.
submission = test_df[["id"]].assign(exam_score=y_pred)

# Write the submission file without the index column.
submission.to_csv("exam_score_predictions.csv", index=False)