In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Academic Success Prediction — the "easiest" approach, much simpler than any other submission | Logistic Regression, Random Forest Classifier**

# If you find my notebook helpful, please consider giving an upvote.

**Dataset:**

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Predict Students' Dropout and Academic Success dataset. Feature distributions are close to, but not exactly the same, as the original.

**Columns and Their Meanings:**

1. id: Unique identifier for each entry.
2. Marital status: Marital status of the student (integer-encoded).
3. Application mode: Mode of application (integer-encoded).
4. Application order: Order of application (integer-encoded).
5. Course: Course the student is enrolled in (integer-encoded).
6. Daytime/evening attendance: Whether the student attends classes during the day or evening (integer-encoded).
7. Previous qualification: Previous educational qualification (integer-encoded).
8. Previous qualification (grade): Grade of the previous qualification (float64).
9. Nacionality: Nationality of the student (integer-encoded). Note that the column name is spelled "Nacionality" in the data.
10. Mother's qualification: Educational qualification of the student's mother (integer-encoded).
11. Father's qualification: Educational qualification of the student's father (integer-encoded).
12. Mother's occupation: Occupation of the student's mother (integer-encoded).
13. Father's occupation: Occupation of the student's father (integer-encoded).
14. Admission grade: Admission grade of the student (float64).
15. Displaced: Whether the student is displaced (integer-encoded).
16. Educational special needs: Whether the student has educational special needs (integer-encoded).
17. Debtor: Whether the student is a debtor (integer-encoded).
18. Tuition fees up to date: Whether the student's tuition fees are up to date (integer-encoded).
19. Gender: Gender of the student (integer-encoded).
20. Scholarship holder: Whether the student is a scholarship holder (integer-encoded).
21. Age at enrollment: Age of the student at the time of enrollment (integer).
22. International: Whether the student is an international student (integer-encoded).
23. Curricular units 1st sem (credited): Number of curricular units credited in the first semester (integer count).
24. Curricular units 1st sem (enrolled): Number of curricular units enrolled in the first semester (integer count).
25. Curricular units 1st sem (evaluations): Number of curricular units evaluated in the first semester (integer count).
26. Curricular units 1st sem (approved): Number of curricular units approved in the first semester (integer count).
27. Curricular units 1st sem (grade): Grade in the curricular units in the first semester (float64).
28. Curricular units 1st sem (without evaluations): Number of curricular units without evaluations in the first semester (integer count).
29. Curricular units 2nd sem (credited): Number of curricular units credited in the second semester (integer count).
30. Curricular units 2nd sem (enrolled): Number of curricular units enrolled in the second semester (integer count).
31. Curricular units 2nd sem (evaluations): Number of curricular units evaluated in the second semester (integer count).
32. Curricular units 2nd sem (approved): Number of curricular units approved in the second semester (integer count).
33. Curricular units 2nd sem (grade): Grade in the curricular units in the second semester (float64).
34. Curricular units 2nd sem (without evaluations): Number of curricular units without evaluations in the second semester (integer count).
35. Unemployment rate: Unemployment rate (float64).
36. Inflation rate: Inflation rate (float64).
37. GDP: Gross Domestic Product (float64).
38. Target: The outcome to predict — one of Dropout, Enrolled, or Graduate (object type, i.e. a categorical label stored as text).

**Non-Null Count:**
Each column has 76,518 non-null entries, indicating that there are no missing values in the dataset.


In [None]:
# Import necessary libraries (data handling, plotting, then scikit-learn)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.preprocessing import LabelEncoder

# Shared objects reused by later cells.
# The encoder converts the string Target labels into integer codes.
encoder = LabelEncoder()
# A fixed random_state makes the forest, and every score or submission derived
# from it, reproducible across "Restart & Run All". The value matches the seed
# used for the train/test split.
rfc = RandomForestClassifier(random_state=2)

In [None]:
# Read the competition training set and preview 50 randomly chosen rows
train = pd.read_csv('/kaggle/input/playground-series-s4e6/train.csv')
train.sample(n=50)

In [None]:
# Read the competition test set and preview 10 randomly chosen rows
test = pd.read_csv('/kaggle/input/playground-series-s4e6/test.csv')
test.sample(n=10)

In [None]:
# (rows, columns) of the train frame, then of the test frame
for frame in (train, test):
    print(frame.shape)

In [None]:
# Summary of the training frame: column names, dtypes and non-null counts
train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the training columns
train.describe()

In [None]:
# Number of missing values in each column of the training frame
train.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
# All columns take part in the comparison, including id.
train.duplicated().sum()

In [None]:
# Replace the string labels in Target with integer codes.
# LabelEncoder numbers the classes in sorted order, which gives:
#   Dropout  -> 0
#   Enrolled -> 1
#   Graduate -> 2

target_codes = encoder.fit_transform(train['Target'])
train['Target'] = target_codes
train['Target'].head(10)

In [None]:
# Count the rows in each encoded Target class.
# The result is ordered by frequency (most common first), not by class code.

train['Target'].value_counts()

# Code legend:
# Dropout = 0
# Enrolled = 1
# Graduate = 2

# Exploratory Data Analysis

In [None]:
# Pie plot of the Target class distribution

# value_counts() orders its result by frequency, not by class code. Sorting by
# the encoded value and looking each label up from its code ensures that every
# wedge carries the name of the class it actually represents.
target_names = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}
target_counts = train['Target'].value_counts().sort_index()
wedge_labels = [target_names.get(code, code) for code in target_counts.index]

plt.pie(target_counts,labels=wedge_labels,autopct='%0.2f')
plt.show()

In [None]:
# Print how many distinct values each column of the training frame holds

for name, n_unique in train.nunique().items():
  print(f'Unique values --> {name} : {n_unique}')

In [None]:
# Separate categorical and numerical columns

# Columns holding category codes or flags (plus the encoded Target).
categorical_columns = [
    'Marital status',
    'Application mode',
    'Nacionality',
    "Mother's occupation",
    "Father's occupation",
    "Mother's qualification",
    "Father's qualification",
    'Previous qualification',
    'Application order',
    'Daytime/evening attendance',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
    'Target',
]

# Columns treated as numeric in the plots below.
numerical_columns = [
    'Course',
    'Previous qualification (grade)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

# **Distribution**

In [None]:
# Distribution of academic success (encoded Target)
#   0 = Dropout, 1 = Enrolled, 2 = Graduate

fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=train, x='Target', edgecolor='black', ax=ax)
ax.set_title('Distribution of Academic Success')
ax.set_xlabel('Academic Success')
ax.set_ylabel('Count')
plt.show();


# Dropout = 0
# Enrolled = 1
# Graduate = 2

In [None]:
# Boxplots to check each numerical column for outliers

# Derive the grid from the column list so that adding or removing a feature
# never overflows or underfills a hard-coded layout.
n_cols = 3
n_rows = (len(numerical_columns) + n_cols - 1) // n_cols

plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_columns):
    plt.subplot(n_rows, n_cols, i+1)
    sns.boxplot(data=train, y=feature)
    # Each box is drawn over all rows (no split by Target), so the title
    # describes the feature's overall distribution.
    plt.title(f'Distribution of {feature}')
    plt.xlabel(' ')
    plt.ylabel(feature)
plt.tight_layout()
plt.show();

In [None]:
# Countplot of the encoded Target classes, split by daytime/evening attendance

# Pass the frame by keyword, like every other seaborn call in this notebook:
# the positional form only works on seaborn versions where `data` is the first
# positional parameter.
sns.countplot(data=train,x='Target',hue='Daytime/evening attendance')
# Render explicitly so the cell shows the figure rather than the Axes repr.
plt.show()

In [None]:
# Scatterplot of Previous qualification (grade) against Curricular units 2nd sem (grade).
# Colour encodes the Target class; marker style encodes daytime/evening attendance.

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=train,
    x='Previous qualification (grade)',
    y='Curricular units 2nd sem (grade)',
    hue='Target',
    style='Daytime/evening attendance',
    palette='viridis',
    ax=ax,
)
plt.show()

In [None]:
# Lineplot of Previous qualification (grade) against Unemployment rate,
# with one line per Target class

fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(
    data=train,
    x="Previous qualification (grade)",
    y="Unemployment rate",
    hue="Target",
    ax=ax,
)
plt.show()

In [None]:
# Scatterplot of Curricular units 2nd sem (grade) against Curricular units 1st sem (grade).
# Colour encodes the Target class; marker style encodes gender.

fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    data=train,
    x='Curricular units 2nd sem (grade)',
    y='Curricular units 1st sem (grade)',
    hue='Target',
    style='Gender',
    ax=ax,
)
plt.show()

In [None]:
# Relplot of Curricular units 1st sem (grade) vs Curricular units 2nd sem (grade),
# coloured by gender, with one panel per attendance type (columns) and Target class (rows)

# relplot is a figure-level function that creates its own figure, so calling
# plt.figure() beforehand would only add an empty extra figure to the output.
sns.relplot(data=train, x="Curricular units 1st sem (grade)", y="Curricular units 2nd sem (grade)", hue="Gender", col="Daytime/evening attendance", row="Target")
plt.show()

# **Correlation matrix**

In [None]:
# Pairwise correlations between all columns of the training frame,
# drawn as an annotated heatmap.
corr_matrix = train.corr()

fig, ax = plt.subplots(figsize=(21, 18))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.1f',
    linewidths=2,
    linecolor='lightgrey',
    ax=ax,
)
fig.suptitle('Correlation Matrix', fontsize=40, y=1)
plt.show()

# **Modelling**

**1. Logistic Regression**

In [None]:
# y is the encoded Target; x is every remaining column except the id.

y = train['Target']
x = train.drop(['Target', 'id'], axis=1)

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on y so both
# parts keep the same class proportions, and seeded so it is repeatable.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [None]:
# Create the Logistic Regression model (one-vs-rest)

# Imported here so this cell stays self-contained.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `multi_class='ovr'` is deprecated in recent scikit-learn releases; wrapping the
# estimator in OneVsRestClassifier is the supported way to get the same scheme.
# Features are standardised first and max_iter is raised because the solver
# typically fails to converge on unscaled features within the default iteration
# budget. The pipeline still exposes fit()/predict() for the following cells.
lr = make_pipeline(
    StandardScaler(),
    OneVsRestClassifier(LogisticRegression(max_iter=1000)),
)

In [None]:
# Fit the logistic regression model on the training part of the split

lr.fit(x_train,y_train)

In [None]:
# Predict the held-out rows with the logistic regression model and print
# the share of them that were classified correctly.

pred = lr.predict(x_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print(lr_accuracy)

**2. Random Forest Classifier**

In [None]:
# Fit the random forest classifier on the same training part of the split

rfc.fit(x_train,y_train)

In [None]:
# Predict the held-out rows with the fitted random forest

y_pred_rfc = rfc.predict(x_test)

In [None]:
# Accuracy of the random forest on the held-out rows.
# An earlier run reported 0.8282801881860952, which is fine for this large
# dataset; the printed value below is the authoritative one for the current run.

rfc_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rfc)
print(rfc_accuracy)

In [None]:
# Feature frame for the competition test set: everything except the id column,
# mirroring how the training features were built.

test1 = test.drop(columns=['id'])

In [None]:
# Predict the encoded Target class for every row of the competition test set
# with the fitted random forest, then display the resulting array

final_test=rfc.predict(test1)
final_test

In [None]:
# Assemble the submission frame: the test ids next to their predicted class codes

submission_columns = {'id': test['id'], 'Target': final_test}
output = pd.DataFrame(submission_columns)

In [None]:
# Translate the predicted class codes back into their label names

code_to_label = {
    0: 'Dropout',
    1: 'Enrolled',
    2: 'Graduate',
}
output['Target'] = output['Target'].map(code_to_label)

In [None]:
# Preview the first 20 rows of the frame that will be written to submission.csv

output.head(20)

In [None]:
# Write the submission frame to submission.csv in the current working directory.
# index=False keeps the DataFrame index out of the file.

output.to_csv('submission.csv',index=False)