# Training Process

In [365]:
# Import libraries

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [366]:
# LOad Dataset 
df=pd.read_csv('student_data.csv')


In [367]:
# Preview data 
df.head()

Unnamed: 0,Student ID,Student Name,Date of Birth,Field of Study,Year of Admission,Expected Year of Graduation,Current Semester,Specialization,Fees,Discount on Fees
0,165527,Bryan Rogers,19/01/2006,Computer Science,2020.0,2017.0,3,Web Development,155152,19572
1,635763,James Hogan,23/05/1999,Mechanical Engineering,2020.0,2020.0,2,Machine Learning,157870,14760
2,740021,David Robinson,02/12/1997,Civil Engineering,2017.0,2022.0,1,Network Security,55662,5871
3,433076,Susan Miller,30/10/1999,Computer Science,2021.0,2019.0,1,Data Science,134955,17284
4,441628,Brittany Martin,10/01/1998,Chemical Engineering,2016.0,2018.0,1,Network Security,125934,14871


In [368]:
# Dataset information 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Student ID                   200000 non-null  int64  
 1   Student Name                 200000 non-null  object 
 2   Date of Birth                200000 non-null  object 
 3   Field of Study               199997 non-null  object 
 4   Year of Admission            199998 non-null  float64
 5   Expected Year of Graduation  199999 non-null  float64
 6   Current Semester             200000 non-null  int64  
 7   Specialization               199998 non-null  object 
 8   Fees                         200000 non-null  int64  
 9   Discount on Fees             200000 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 15.3+ MB


In [369]:
df.describe()

Unnamed: 0,Student ID,Year of Admission,Expected Year of Graduation,Current Semester,Fees,Discount on Fees
count,200000.0,199998.0,199999.0,200000.0,200000.0,200000.0
mean,549367.492925,2018.99766,2019.995225,2.49902,125092.847595,12484.258575
std,259361.565011,2.002375,1.997744,1.117804,43287.894903,8788.362629
min,100001.0,2016.0,2017.0,1.0,50000.0,0.0
25%,325311.0,2017.0,2018.0,1.0,87641.5,5383.0
50%,548855.5,2019.0,2020.0,2.0,125221.0,10792.5
75%,774182.5,2021.0,2022.0,3.0,162597.25,18154.0
max,999997.0,2022.0,2023.0,4.0,200000.0,39865.0


## Data Preprocessing

In [370]:
# Drop unnessary columns

df.drop(['Student ID', 'Student Name'], axis=1, inplace=True)




In [371]:
# Handle missing values 

df.isnull().sum()

Date of Birth                  0
Field of Study                 3
Year of Admission              2
Expected Year of Graduation    1
Current Semester               0
Specialization                 2
Fees                           0
Discount on Fees               0
dtype: int64

In [372]:
# See only missing columns 

missing_columns = df.columns[df.isnull().any()]
print("Columns with missing values:", missing_columns.tolist())


Columns with missing values: ['Field of Study', 'Year of Admission', 'Expected Year of Graduation', 'Specialization']


In [373]:
# Display column names with missing values in a clean format

missing_columns = df.columns[df.isnull().any()]
print("Columns with missing values:")
for col in missing_columns:
    print(f"- {col}")


Columns with missing values:
- Field of Study
- Year of Admission
- Expected Year of Graduation
- Specialization


In [374]:
# Identify columns with missing values
missing_columns = df.isnull().sum() > 0

# Handle missing values
for col in df.columns[missing_columns]:  # Loop through columns with missing values
    if df[col].dtype == 'object':  # If the column is categorical
        df[col].fillna(df[col].mode()[0], inplace=True)  # Fill with the most frequent value (mode)
    else:  # If the column is numeric
        df[col].fillna(df[col].mean(), inplace=True)  # Fill with the mean


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)  # Fill with the most frequent value (mode)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)  # Fill with the mean


In [375]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Date of Birth                200000 non-null  object 
 1   Field of Study               200000 non-null  object 
 2   Year of Admission            200000 non-null  float64
 3   Expected Year of Graduation  200000 non-null  float64
 4   Current Semester             200000 non-null  int64  
 5   Specialization               200000 non-null  object 
 6   Fees                         200000 non-null  int64  
 7   Discount on Fees             200000 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 12.2+ MB


In [376]:
df.head()

Unnamed: 0,Date of Birth,Field of Study,Year of Admission,Expected Year of Graduation,Current Semester,Specialization,Fees,Discount on Fees
0,19/01/2006,Computer Science,2020.0,2017.0,3,Web Development,155152,19572
1,23/05/1999,Mechanical Engineering,2020.0,2020.0,2,Machine Learning,157870,14760
2,02/12/1997,Civil Engineering,2017.0,2022.0,1,Network Security,55662,5871
3,30/10/1999,Computer Science,2021.0,2019.0,1,Data Science,134955,17284
4,10/01/1998,Chemical Engineering,2016.0,2018.0,1,Network Security,125934,14871


# Encoding

In [377]:
# Spreate categorical columns

categorical_col=df.select_dtypes(include=['object','category']).columns
categorical_col

Index(['Date of Birth', 'Field of Study', 'Specialization'], dtype='object')

In [378]:
# Cardinality

cardinality = df[categorical_col].nunique()
print(cardinality)


Date of Birth     3286
Field of Study       5
Specialization       5
dtype: int64


In [379]:
df['Date of Birth']

0         19/01/2006
1         23/05/1999
2         02/12/1997
3         30/10/1999
4         10/01/1998
             ...    
199995    06/06/2001
199996    15/12/2003
199997    27/07/2000
199998    16/02/2001
199999    17/10/2005
Name: Date of Birth, Length: 200000, dtype: object

In [380]:
# Convert 'Date of Birth' to datetime and split into Year, Month, Day

df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')

# Extract Year, Month, and Day from 'Date of Birth'

df['Year_of_Birth'] = df['Date of Birth'].dt.year
df['Month_of_Birth'] = df['Date of Birth'].dt.month
df['Day_of_Birth'] = df['Date of Birth'].dt.day

# Drop the original 'Date of Birth' column

df.drop(columns=['Date of Birth'], inplace=True)
print(df.columns)



  df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], errors='coerce')


Index(['Field of Study', 'Year of Admission', 'Expected Year of Graduation',
       'Current Semester', 'Specialization', 'Fees', 'Discount on Fees',
       'Year_of_Birth', 'Month_of_Birth', 'Day_of_Birth'],
      dtype='object')


In [381]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Field of Study               200000 non-null  object 
 1   Year of Admission            200000 non-null  float64
 2   Expected Year of Graduation  200000 non-null  float64
 3   Current Semester             200000 non-null  int64  
 4   Specialization               200000 non-null  object 
 5   Fees                         200000 non-null  int64  
 6   Discount on Fees             200000 non-null  int64  
 7   Year_of_Birth                200000 non-null  int32  
 8   Month_of_Birth               200000 non-null  int32  
 9   Day_of_Birth                 200000 non-null  int32  
dtypes: float64(2), int32(3), int64(3), object(2)
memory usage: 13.0+ MB


In [382]:
# Initialize OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output = False, drop = 'first')


In [383]:
# Display the column names of the DataFrame

print(df.columns)

Index(['Field of Study', 'Year of Admission', 'Expected Year of Graduation',
       'Current Semester', 'Specialization', 'Fees', 'Discount on Fees',
       'Year_of_Birth', 'Month_of_Birth', 'Day_of_Birth'],
      dtype='object')


In [384]:
# Select columns to encode

columns_to_encoder = ['Field of Study', 'Specialization']
encoded_array =  one_hot_encoder.fit_transform(df[columns_to_encoder])

In [385]:
# Convert encoded data into a DataFrame

encoded_df = pd.DataFrame(encoded_array, columns=one_hot_encoder.get_feature_names_out(columns_to_encoder))


In [386]:
# Merge encoded columns with the original DataFrame

df = df.drop(columns=columns_to_encoder).reset_index(drop=True)  # Kodlangan ustunlarni o‘chirish
df = pd.concat([df, encoded_df], axis=1)

In [387]:
df.head()

Unnamed: 0,Year of Admission,Expected Year of Graduation,Current Semester,Fees,Discount on Fees,Year_of_Birth,Month_of_Birth,Day_of_Birth,Field of Study_Civil Engineering,Field of Study_Computer Science,Field of Study_Electrical Engineering,Field of Study_Mechanical Engineering,Specialization_Data Science,Specialization_Machine Learning,Specialization_Network Security,Specialization_Web Development
0,2020.0,2017.0,3,155152,19572,2006,1,19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2020.0,2020.0,2,157870,14760,1999,5,23,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,2017.0,2022.0,1,55662,5871,1997,12,2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2021.0,2019.0,1,134955,17284,1999,10,30,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2016.0,2018.0,1,125934,14871,1998,1,10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [388]:
# Display the first few rows of the DataFrame

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Year of Admission                      200000 non-null  float64
 1   Expected Year of Graduation            200000 non-null  float64
 2   Current Semester                       200000 non-null  int64  
 3   Fees                                   200000 non-null  int64  
 4   Discount on Fees                       200000 non-null  int64  
 5   Year_of_Birth                          200000 non-null  int32  
 6   Month_of_Birth                         200000 non-null  int32  
 7   Day_of_Birth                           200000 non-null  int32  
 8   Field of Study_Civil Engineering       200000 non-null  float64
 9   Field of Study_Computer Science        200000 non-null  float64
 10  Field of Study_Electrical Engineering  200000 non-null  

In [389]:
# Convert all columns to integer data type

df[df.columns] = df[df.columns].astype(int)

In [390]:
# Display the first few rows of the updated DataFrame

df.head()

Unnamed: 0,Year of Admission,Expected Year of Graduation,Current Semester,Fees,Discount on Fees,Year_of_Birth,Month_of_Birth,Day_of_Birth,Field of Study_Civil Engineering,Field of Study_Computer Science,Field of Study_Electrical Engineering,Field of Study_Mechanical Engineering,Specialization_Data Science,Specialization_Machine Learning,Specialization_Network Security,Specialization_Web Development
0,2020,2017,3,155152,19572,2006,1,19,0,1,0,0,0,0,0,1
1,2020,2020,2,157870,14760,1999,5,23,0,0,0,1,0,1,0,0
2,2017,2022,1,55662,5871,1997,12,2,1,0,0,0,0,0,1,0
3,2021,2019,1,134955,17284,1999,10,30,0,1,0,0,1,0,0,0
4,2016,2018,1,125934,14871,1998,1,10,0,0,0,0,0,0,1,0


In [391]:
# Display DataFrame summary

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column                                 Non-Null Count   Dtype
---  ------                                 --------------   -----
 0   Year of Admission                      200000 non-null  int64
 1   Expected Year of Graduation            200000 non-null  int64
 2   Current Semester                       200000 non-null  int64
 3   Fees                                   200000 non-null  int64
 4   Discount on Fees                       200000 non-null  int64
 5   Year_of_Birth                          200000 non-null  int64
 6   Month_of_Birth                         200000 non-null  int64
 7   Day_of_Birth                           200000 non-null  int64
 8   Field of Study_Civil Engineering       200000 non-null  int64
 9   Field of Study_Computer Science        200000 non-null  int64
 10  Field of Study_Electrical Engineering  200000 non-null  int64
 11  Field of Stud

# Training Process

In [392]:
# Define input (x) and output (y)

x=df.drop('Discount on Fees', axis=1) #Input
y=df['Discount on Fees'] #Output

In [393]:
# Display the first few rows of the input data

x.head()

Unnamed: 0,Year of Admission,Expected Year of Graduation,Current Semester,Fees,Year_of_Birth,Month_of_Birth,Day_of_Birth,Field of Study_Civil Engineering,Field of Study_Computer Science,Field of Study_Electrical Engineering,Field of Study_Mechanical Engineering,Specialization_Data Science,Specialization_Machine Learning,Specialization_Network Security,Specialization_Web Development
0,2020,2017,3,155152,2006,1,19,0,1,0,0,0,0,0,1
1,2020,2020,2,157870,1999,5,23,0,0,0,1,0,1,0,0
2,2017,2022,1,55662,1997,12,2,1,0,0,0,0,0,1,0
3,2021,2019,1,134955,1999,10,30,0,1,0,0,1,0,0,0
4,2016,2018,1,125934,1998,1,10,0,0,0,0,0,0,1,0


In [394]:
# Import train_test_split

from sklearn.model_selection import train_test_split

In [395]:
# Split data into training and testing sets

x_train,y_train,x_test, y_test=train_test_split(x,y,test_size=0.3, random_state=42)

In [396]:
# Check the lengths of the training and testing sets

print("x_train length:", len(x_train))
print("x_test length:", len(x_test))
print("y_train length:", len(y_train))
print("y_test length:", len(y_test))


x_train length: 140000
x_test length: 140000
y_train length: 60000
y_test length: 60000


In [397]:
# Check the shapes of the training and testing sets

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


x_train shape: (140000, 15)
x_test shape: (140000,)
y_train shape: (60000, 15)
y_test shape: (60000,)


In [398]:
# Split the data into training and testing sets

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3, random_state=42)

In [399]:
# Import the Linear Regression model

from sklearn.linear_model import LinearRegression

In [400]:
# Initialize the Linear Regression model

model=LinearRegression()  

In [401]:
# Train the Linear Regression model

model.fit(x_train,y_train)

In [402]:
# Make predictions on the test set

y_pred=model.predict(x_test)

In [403]:
# Display the first 5 predicted values

y_pred[:5]

array([ 5294.83538437, 14529.20294363, 15869.26586992,  5885.34247457,
        6462.04618019])