# Preparing Dummy data 

In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of students
num_students = 10000

# Generate student data
data = pd.DataFrame({
    'Roll No': range(1, num_students + 1),
    'Gender': np.random.choice(['Male', 'Female'], size=num_students),
    '10th Percentage': np.random.randint(0, 101, size=num_students),
    '12th Percentage': np.random.randint(0, 101, size=num_students),
    '12th Stream': np.random.choice(['Science', 'Art', 'Commerce'], size=num_students),
    'Family Count': np.random.randint(0, 11, size=num_students),
    'Father Occupation': np.random.choice(['Government Job', 'Private Job', 'Army', 'Housewife', 'Business', 'Farmer', 'No Job'], size=num_students),
    'Mother Occupation': np.random.choice(['Government Job', 'Private Job', 'Army', 'Housewife', 'Business', 'Farmer', 'No Job'], size=num_students),
    'Siblings Count': np.random.randint(0, 6, size=num_students),
})

# Generate random subject marks for each semester
subjects_all_sem = [
    ['English', 'Math', 'Basic Science', 'ICT', 'WPC'],
    ['EEC', 'AMI', 'BEC', 'PCI', 'BCC', 'CPH', 'WPD'],
    ['OOP', 'DSU', 'CGR', 'DMS', 'DTE'],
    ['JPR', 'SEN', 'DCC', 'MIC', 'GAD']
]

for i, subjects in enumerate(subjects_all_sem):
    data = pd.concat([data, pd.DataFrame(np.random.randint(0, 101, size=(num_students, len(subjects))), columns=[f"{subject}_{i+1}st_Sem" for subject in subjects])], axis=1)

# Generate 5th semester overall percentage (target variable)
data['Overall Percentage (5th Sem)'] = np.random.randint(0, 101, size=num_students)

# Assign target groups using numpy.select
conditions = [
    data['Overall Percentage (5th Sem)'] < 35,
    (data['Overall Percentage (5th Sem)'] >= 35) & (data['Overall Percentage (5th Sem)'] < 50),
    (data['Overall Percentage (5th Sem)'] >= 50) & (data['Overall Percentage (5th Sem)'] < 75),
    data['Overall Percentage (5th Sem)'] >= 75
]
choices = ['<35', '35 to 50', '50 to 75', '75 to 100']
data['Target Group'] = np.select(conditions, choices, default='NA')  # Set a default value if none of the conditions are met

# Map marks to target groups
def map_marks_to_groups(row):
    marks_cols = [col for col in data.columns if col.endswith('Sem')]
    for col in marks_cols:
        row[col] = np.minimum(row[col] * 0.1 + 5 if row['Target Group'] == '<35' else
                               row[col] * 0.4 + 50 if row['Target Group'] == '35 to 50' else
                               row[col] * 0.7 + 75 if row['Target Group'] == '50 to 75' else
                               row[col], 100)  # Ensure marks don't exceed 100
    return row

data = data.apply(map_marks_to_groups, axis=1)

# Save the dataframe to a CSV file
data.to_csv('dummy_student_data_10000.csv', index=False)
print("Dummy student data generated and saved to 'dummy_student_data_10000.csv'.")

Dummy student data generated and saved to 'dummy_student_data_10000.csv'.


# Uploading data into Database

In [2]:
import sqlite3

In [3]:
# Connect to SQLite database (or create it if it doesn't exist)
conn = sqlite3.connect('student_data.db')
cursor = conn.cursor()

In [4]:
# Create a table to store the student data
cursor.execute('''CREATE TABLE IF NOT EXISTS student_data (
                    Roll_No INTEGER PRIMARY KEY,
                    Gender TEXT,
                    "10th_Percentage" REAL,
                    "12th_Percentage" REAL,
                    "12th_Stream" TEXT,
                    Family_Count INTEGER,
                    Father_Occupation TEXT,
                    Mother_Occupation TEXT,
                    Siblings_Count INTEGER,
                    English_1st_Sem REAL,
                    Math_1st_Sem REAL,
                    Basic_Science_1st_Sem REAL,
                    ICT_1st_Sem REAL,
                    WPC_1st_Sem REAL,
                    EEC_2nd_Sem REAL,
                    AMI_2nd_Sem REAL,
                    BEC_2nd_Sem REAL,
                    PCI_2nd_Sem REAL,
                    BCC_2nd_Sem REAL,
                    CPH_2nd_Sem REAL,
                    WPD_2nd_Sem REAL,
                    OOP_3rd_Sem REAL,
                    DSU_3rd_Sem REAL,
                    CGR_3rd_Sem REAL,
                    DMS_3rd_Sem REAL,
                    DTE_3rd_Sem REAL,
                    JPR_4th_Sem REAL,
                    SEN_4th_Sem REAL,
                    DCC_4th_Sem REAL,
                    MIC_4th_Sem REAL,
                    GAD_4th_Sem REAL,
                    "Overall_Percentage_(5th_Sem)" REAL,
                    Target_Group TEXT
                 )''')


<sqlite3.Cursor at 0x184d9d3fd40>

In [5]:
# Upload the DataFrame to the SQLite database
data.to_sql('student_data', conn, if_exists='replace', index=False)
# Commit changes and close the connection
conn.commit()
conn.close()

# Reading data from Database

In [6]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('student_data.db')

# Read the table into a DataFrame
data = pd.read_sql_query('SELECT * FROM student_data', conn)
# Close the connection
conn.close()

In [7]:
data.head()

Unnamed: 0,Roll No,Gender,10th Percentage,12th Percentage,12th Stream,Family Count,Father Occupation,Mother Occupation,Siblings Count,English_1st_Sem,...,CGR_3st_Sem,DMS_3st_Sem,DTE_3st_Sem,JPR_4st_Sem,SEN_4st_Sem,DCC_4st_Sem,MIC_4st_Sem,GAD_4st_Sem,Overall Percentage (5th Sem),Target Group
0,1,Male,50,52,Art,3,Farmer,Private Job,1,84.4,...,79.2,90.0,51.2,70.0,64.8,80.4,70.8,82.0,45,35 to 50
1,2,Female,70,90,Science,1,Army,Housewife,0,10.0,...,11.3,14.7,12.5,5.6,14.0,10.0,12.5,8.1,5,<35
2,3,Male,95,80,Art,5,Private Job,No Job,5,31.0,...,26.0,56.0,37.0,54.0,100.0,32.0,77.0,56.0,77,75 to 100
3,4,Male,47,65,Art,8,No Job,Government Job,1,9.0,...,15.0,46.0,40.0,53.0,87.0,43.0,47.0,9.0,87,75 to 100
4,5,Male,18,30,Art,9,Business,Housewife,5,15.0,...,9.8,7.5,10.3,14.5,12.5,6.6,14.2,5.1,32,<35


In [8]:
data.head()

Unnamed: 0,Roll No,Gender,10th Percentage,12th Percentage,12th Stream,Family Count,Father Occupation,Mother Occupation,Siblings Count,English_1st_Sem,...,CGR_3st_Sem,DMS_3st_Sem,DTE_3st_Sem,JPR_4st_Sem,SEN_4st_Sem,DCC_4st_Sem,MIC_4st_Sem,GAD_4st_Sem,Overall Percentage (5th Sem),Target Group
0,1,Male,50,52,Art,3,Farmer,Private Job,1,84.4,...,79.2,90.0,51.2,70.0,64.8,80.4,70.8,82.0,45,35 to 50
1,2,Female,70,90,Science,1,Army,Housewife,0,10.0,...,11.3,14.7,12.5,5.6,14.0,10.0,12.5,8.1,5,<35
2,3,Male,95,80,Art,5,Private Job,No Job,5,31.0,...,26.0,56.0,37.0,54.0,100.0,32.0,77.0,56.0,77,75 to 100
3,4,Male,47,65,Art,8,No Job,Government Job,1,9.0,...,15.0,46.0,40.0,53.0,87.0,43.0,47.0,9.0,87,75 to 100
4,5,Male,18,30,Art,9,Business,Housewife,5,15.0,...,9.8,7.5,10.3,14.5,12.5,6.6,14.2,5.1,32,<35


In [9]:
data.columns.tolist()

['Roll No',
 'Gender',
 '10th Percentage',
 '12th Percentage',
 '12th Stream',
 'Family Count',
 'Father Occupation',
 'Mother Occupation',
 'Siblings Count',
 'English_1st_Sem',
 'Math_1st_Sem',
 'Basic Science_1st_Sem',
 'ICT_1st_Sem',
 'WPC_1st_Sem',
 'EEC_2st_Sem',
 'AMI_2st_Sem',
 'BEC_2st_Sem',
 'PCI_2st_Sem',
 'BCC_2st_Sem',
 'CPH_2st_Sem',
 'WPD_2st_Sem',
 'OOP_3st_Sem',
 'DSU_3st_Sem',
 'CGR_3st_Sem',
 'DMS_3st_Sem',
 'DTE_3st_Sem',
 'JPR_4st_Sem',
 'SEN_4st_Sem',
 'DCC_4st_Sem',
 'MIC_4st_Sem',
 'GAD_4st_Sem',
 'Overall Percentage (5th Sem)',
 'Target Group']

In [10]:
data['Target Group'].value_counts()

<35          3478
75 to 100    2546
50 to 75     2435
35 to 50     1541
Name: Target Group, dtype: int64

In [11]:
data['Target Group'].value_counts(normalize=True)

<35          0.3478
75 to 100    0.2546
50 to 75     0.2435
35 to 50     0.1541
Name: Target Group, dtype: float64

In [12]:
# matplotlib.use('Agg')  # Use the 'Agg' backend
# import sweetviz

In [13]:
# import pandas as pd
# import sweetviz as sv
# import matplotlib.pyplot as plt
# %matplotlib inline
# #EDA using Autoviz
# sweet_report = sv.analyze(data)
# #Saving results to HTML file
# sweet_report.show_html('sweet_report.html')

# EDA

In [14]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [15]:
categorical_cols = [col for col in data.columns if data[col].dtype == 'object']
categorical_cols

['Gender',
 '12th Stream',
 'Father Occupation',
 'Mother Occupation',
 'Target Group']

In [16]:
numerical_col=[col for col in data.columns if data[col].dtype != 'object']
numerical_col

['Roll No',
 '10th Percentage',
 '12th Percentage',
 'Family Count',
 'Siblings Count',
 'English_1st_Sem',
 'Math_1st_Sem',
 'Basic Science_1st_Sem',
 'ICT_1st_Sem',
 'WPC_1st_Sem',
 'EEC_2st_Sem',
 'AMI_2st_Sem',
 'BEC_2st_Sem',
 'PCI_2st_Sem',
 'BCC_2st_Sem',
 'CPH_2st_Sem',
 'WPD_2st_Sem',
 'OOP_3st_Sem',
 'DSU_3st_Sem',
 'CGR_3st_Sem',
 'DMS_3st_Sem',
 'DTE_3st_Sem',
 'JPR_4st_Sem',
 'SEN_4st_Sem',
 'DCC_4st_Sem',
 'MIC_4st_Sem',
 'GAD_4st_Sem',
 'Overall Percentage (5th Sem)']

In [17]:
data=pd.DataFrame(data)
data.shape

(10000, 33)

In [18]:
data.head()

Unnamed: 0,Roll No,Gender,10th Percentage,12th Percentage,12th Stream,Family Count,Father Occupation,Mother Occupation,Siblings Count,English_1st_Sem,...,CGR_3st_Sem,DMS_3st_Sem,DTE_3st_Sem,JPR_4st_Sem,SEN_4st_Sem,DCC_4st_Sem,MIC_4st_Sem,GAD_4st_Sem,Overall Percentage (5th Sem),Target Group
0,1,Male,50,52,Art,3,Farmer,Private Job,1,84.4,...,79.2,90.0,51.2,70.0,64.8,80.4,70.8,82.0,45,35 to 50
1,2,Female,70,90,Science,1,Army,Housewife,0,10.0,...,11.3,14.7,12.5,5.6,14.0,10.0,12.5,8.1,5,<35
2,3,Male,95,80,Art,5,Private Job,No Job,5,31.0,...,26.0,56.0,37.0,54.0,100.0,32.0,77.0,56.0,77,75 to 100
3,4,Male,47,65,Art,8,No Job,Government Job,1,9.0,...,15.0,46.0,40.0,53.0,87.0,43.0,47.0,9.0,87,75 to 100
4,5,Male,18,30,Art,9,Business,Housewife,5,15.0,...,9.8,7.5,10.3,14.5,12.5,6.6,14.2,5.1,32,<35


In [19]:
# !python -m pip install autoviz

In [20]:
#importing Autoviz class
from autoviz.AutoViz_Class import AutoViz_Class#Instantiate the AutoViz class
AV = AutoViz_Class()

Imported v0.1.808. Please call AutoViz in this sequence:
    AV = AutoViz_Class()
    %matplotlib inline
    dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
               chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30, save_plot_dir=None)


In [21]:
df = AV.AutoViz('dummy_student_data_10000.csv')

Shape of your Data Set loaded: (10000, 33)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  22
    Number of Integer-Categorical Columns =  5
    Number of String-Categorical Columns =  4
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  1
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  1
    Number of Columns to Delete =  0
    33 Predictors classified...
        1 variable(s) removed since they were ID or low-information variables
        List of variables removed: ['Roll No']
22 numeric variables in data exceeds limit, taking top

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
Roll No,int64,0.0,100.0,1.0,10000.0,Possible ID column: drop before modeling step.
Gender,object,0.0,0.0,,,No issue
10th Percentage,int64,0.0,1.0,0.0,100.0,No issue
12th Percentage,int64,0.0,1.0,0.0,100.0,No issue
12th Stream,object,0.0,0.0,,,No issue
Family Count,int64,0.0,0.0,0.0,10.0,No issue
Father Occupation,object,0.0,0.0,,,No issue
Mother Occupation,object,0.0,0.0,,,No issue
Siblings Count,int64,0.0,0.0,0.0,5.0,No issue
English_1st_Sem,float64,0.0,,0.0,100.0,No issue


Number of All Scatter Plots = 253
All Plots done
Time to run AutoViz = 63 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


In [22]:
df = AV.AutoViz('dummy_student_data_10000.csv', depVar="Overall Percentage (5th Sem)")

Shape of your Data Set loaded: (10000, 33)
#######################################################################################
######################## C L A S S I F Y I N G  V A R I A B L E S  ####################
#######################################################################################
Classifying variables in data set...
    Number of Numeric Columns =  22
    Number of Integer-Categorical Columns =  4
    Number of String-Categorical Columns =  4
    Number of Factor-Categorical Columns =  0
    Number of String-Boolean Columns =  1
    Number of Numeric-Boolean Columns =  0
    Number of Discrete String Columns =  0
    Number of NLP String Columns =  0
    Number of Date Time Columns =  0
    Number of ID Columns =  1
    Number of Columns to Delete =  0
    32 Predictors classified...
        1 variable(s) removed since they were ID or low-information variables
        List of variables removed: ['Roll No']

################ Regression problem #################

Unnamed: 0,Data Type,Missing Values%,Unique Values%,Minimum Value,Maximum Value,DQ Issue
Math_1st_Sem,float64,0.0,,0.0,100.0,No issue
Gender,object,0.0,0.0,,,No issue
10th Percentage,int64,0.0,1.0,0.0,100.0,No issue
12th Percentage,int64,0.0,1.0,0.0,100.0,No issue
12th Stream,object,0.0,0.0,,,No issue
Family Count,int64,0.0,0.0,0.0,10.0,No issue
Father Occupation,object,0.0,0.0,,,No issue
Mother Occupation,object,0.0,0.0,,,No issue
Siblings Count,int64,0.0,0.0,0.0,5.0,No issue
Target Group,object,0.0,0.0,,,No issue


All Plots done
Time to run AutoViz = 7 seconds 

 ###################### AUTO VISUALIZATION Completed ########################


In [23]:
# !python -m pip install sweetviz

In [24]:
import sweetviz as sv

In [25]:
report = sv.analyze([data,'data'], target_feat='Overall Percentage (5th Sem)')

                                             |          | [  0%]   00:00 -> (? left)

In [26]:
report.show_html('Report.html')

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [27]:
# !python -m pip install pandas-profiling

In [28]:
# from pandas_profiling import ProfileReport

In [29]:
# profile = ProfileReport(df)
# profile

In [30]:
data.select_dtypes(include=[np.number]).columns

Index(['Roll No', '10th Percentage', '12th Percentage', 'Family Count',
       'Siblings Count', 'English_1st_Sem', 'Math_1st_Sem',
       'Basic Science_1st_Sem', 'ICT_1st_Sem', 'WPC_1st_Sem', 'EEC_2st_Sem',
       'AMI_2st_Sem', 'BEC_2st_Sem', 'PCI_2st_Sem', 'BCC_2st_Sem',
       'CPH_2st_Sem', 'WPD_2st_Sem', 'OOP_3st_Sem', 'DSU_3st_Sem',
       'CGR_3st_Sem', 'DMS_3st_Sem', 'DTE_3st_Sem', 'JPR_4st_Sem',
       'SEN_4st_Sem', 'DCC_4st_Sem', 'MIC_4st_Sem', 'GAD_4st_Sem',
       'Overall Percentage (5th Sem)'],
      dtype='object')

In [31]:
data['Target Group'].unique()

array(['35 to 50', '<35', '75 to 100', '50 to 75'], dtype=object)

In [32]:
data.columns[data.columns.str.contains('Sem')]

Index(['English_1st_Sem', 'Math_1st_Sem', 'Basic Science_1st_Sem',
       'ICT_1st_Sem', 'WPC_1st_Sem', 'EEC_2st_Sem', 'AMI_2st_Sem',
       'BEC_2st_Sem', 'PCI_2st_Sem', 'BCC_2st_Sem', 'CPH_2st_Sem',
       'WPD_2st_Sem', 'OOP_3st_Sem', 'DSU_3st_Sem', 'CGR_3st_Sem',
       'DMS_3st_Sem', 'DTE_3st_Sem', 'JPR_4st_Sem', 'SEN_4st_Sem',
       'DCC_4st_Sem', 'MIC_4st_Sem', 'GAD_4st_Sem',
       'Overall Percentage (5th Sem)'],
      dtype='object')

In [35]:
# Create boxplots to visualize central tendency and distribution (outliers) for numerical features by target group
for col in data.columns[data.columns.str.contains('Sem')]:
    sns.histplot(x=col, data=data)  # Show means with boxplots
    plt.title(f"Distribution of {col} ")
    plt.show()

In [36]:
# Create boxplots to visualize central tendency and distribution (outliers) for numerical features by target group
for col in list(set(numerical_col)-set(['Roll No'])):
    sns.boxplot(x='Target Group', y=col, showmeans=True, data=data,order=['<35','35 to 50', '50 to 75','75 to 100'])  # Show means with boxplots
    plt.title(f"Distribution of {col} by Target Group (Boxplot)")
    plt.show()

In [37]:
# Create countplots to visualize categorical feature distribution by target group
for col in data.select_dtypes(include=['object']).columns:
    sns.countplot(x=col, hue='Target Group', data=data)
    plt.title(f"Distribution of {col} by Target Group (Countplot)")
    plt.show()

In [38]:
categorical_cols

['Gender',
 '12th Stream',
 'Father Occupation',
 'Mother Occupation',
 'Target Group']

In [39]:
for i in categorical_cols:
    display(data[i].value_counts())
    display(data[i].unique())
    print("\n")

Male      5013
Female    4987
Name: Gender, dtype: int64

array(['Male', 'Female'], dtype=object)





Science     3448
Commerce    3367
Art         3185
Name: 12th Stream, dtype: int64

array(['Art', 'Science', 'Commerce'], dtype=object)





Private Job       1487
Housewife         1487
No Job            1428
Farmer            1423
Business          1406
Army              1398
Government Job    1371
Name: Father Occupation, dtype: int64

array(['Farmer', 'Army', 'Private Job', 'No Job', 'Business',
       'Government Job', 'Housewife'], dtype=object)





Business          1455
No Job            1446
Government Job    1441
Farmer            1433
Army              1419
Private Job       1416
Housewife         1390
Name: Mother Occupation, dtype: int64

array(['Private Job', 'Housewife', 'No Job', 'Government Job', 'Farmer',
       'Business', 'Army'], dtype=object)





<35          3478
75 to 100    2546
50 to 75     2435
35 to 50     1541
Name: Target Group, dtype: int64

array(['35 to 50', '<35', '75 to 100', '50 to 75'], dtype=object)





# Data Preprocessing

In [40]:
gender_mapping = {'Male': 1, 'Female': 0}
data['Gender'] = data['Gender'].map(gender_mapping)

stream_12_mapping = {'Science': 2, 'Commerce': 1,'Art':0}
data['12th Stream'] = data['12th Stream'].map(stream_12_mapping)

In [41]:
data.head(10)

Unnamed: 0,Roll No,Gender,10th Percentage,12th Percentage,12th Stream,Family Count,Father Occupation,Mother Occupation,Siblings Count,English_1st_Sem,Math_1st_Sem,Basic Science_1st_Sem,ICT_1st_Sem,WPC_1st_Sem,EEC_2st_Sem,AMI_2st_Sem,BEC_2st_Sem,PCI_2st_Sem,BCC_2st_Sem,CPH_2st_Sem,WPD_2st_Sem,OOP_3st_Sem,DSU_3st_Sem,CGR_3st_Sem,DMS_3st_Sem,DTE_3st_Sem,JPR_4st_Sem,SEN_4st_Sem,DCC_4st_Sem,MIC_4st_Sem,GAD_4st_Sem,Overall Percentage (5th Sem),Target Group
0,1,1,50,52,0,3,Farmer,Private Job,1,84.4,60.8,88.8,59.6,76.0,67.6,78.4,78.8,74.4,68.0,62.8,64.4,89.6,56.8,79.2,90.0,51.2,70.0,64.8,80.4,70.8,82.0,45,35 to 50
1,2,0,70,90,2,1,Army,Housewife,0,10.0,12.2,8.3,9.5,9.6,10.9,9.8,11.9,6.6,7.2,8.6,11.5,12.1,7.3,11.3,14.7,12.5,5.6,14.0,10.0,12.5,8.1,5,<35
2,3,1,95,80,0,5,Private Job,No Job,5,31.0,11.0,18.0,46.0,37.0,81.0,91.0,13.0,85.0,0.0,41.0,40.0,87.0,41.0,26.0,56.0,37.0,54.0,100.0,32.0,77.0,56.0,77,75 to 100
3,4,1,47,65,0,8,No Job,Government Job,1,9.0,9.0,98.0,40.0,54.0,38.0,43.0,71.0,90.0,73.0,33.0,53.0,51.0,80.0,15.0,46.0,40.0,53.0,87.0,43.0,47.0,9.0,87,75 to 100
4,5,1,18,30,0,9,Business,Housewife,5,15.0,14.5,12.0,12.3,9.2,9.5,8.2,10.8,8.6,5.7,7.9,6.9,7.6,14.8,9.8,7.5,10.3,14.5,12.5,6.6,14.2,5.1,32,<35
5,6,0,5,60,2,10,Private Job,Farmer,4,37.0,26.0,31.0,70.0,47.0,64.0,48.0,70.0,57.0,74.0,61.0,6.0,5.0,0.0,99.0,18.0,33.0,34.0,48.0,34.0,57.0,12.0,85,75 to 100
6,7,1,83,63,2,9,Business,Business,4,60.0,84.0,72.8,80.0,59.2,64.0,83.2,88.4,73.2,63.2,84.4,74.8,52.0,77.2,65.2,62.0,65.2,56.4,60.4,52.8,66.8,56.8,38,35 to 50
7,8,1,26,100,0,7,Farmer,No Job,3,100.0,100.0,100.0,100.0,96.7,91.1,100.0,80.6,90.4,100.0,100.0,100.0,79.2,77.8,100.0,93.9,100.0,100.0,100.0,100.0,81.3,100.0,74,50 to 75
8,9,1,52,97,1,3,Army,No Job,5,6.5,13.9,11.9,11.4,6.2,8.8,14.7,5.5,5.2,8.7,5.7,8.9,11.9,8.9,6.7,6.6,5.0,6.5,5.7,8.1,5.0,7.1,30,<35
9,10,0,100,33,1,1,Army,Business,0,5.0,11.0,8.0,83.0,58.0,8.0,78.0,63.0,93.0,48.0,67.0,92.0,18.0,4.0,73.0,73.0,87.0,61.0,14.0,92.0,37.0,91.0,91,75 to 100


In [42]:
modelling_df=data.copy()

In [43]:
modelling_df.columns.tolist()

['Roll No',
 'Gender',
 '10th Percentage',
 '12th Percentage',
 '12th Stream',
 'Family Count',
 'Father Occupation',
 'Mother Occupation',
 'Siblings Count',
 'English_1st_Sem',
 'Math_1st_Sem',
 'Basic Science_1st_Sem',
 'ICT_1st_Sem',
 'WPC_1st_Sem',
 'EEC_2st_Sem',
 'AMI_2st_Sem',
 'BEC_2st_Sem',
 'PCI_2st_Sem',
 'BCC_2st_Sem',
 'CPH_2st_Sem',
 'WPD_2st_Sem',
 'OOP_3st_Sem',
 'DSU_3st_Sem',
 'CGR_3st_Sem',
 'DMS_3st_Sem',
 'DTE_3st_Sem',
 'JPR_4st_Sem',
 'SEN_4st_Sem',
 'DCC_4st_Sem',
 'MIC_4st_Sem',
 'GAD_4st_Sem',
 'Overall Percentage (5th Sem)',
 'Target Group']

In [44]:
modelling_df.drop(columns=['Roll No','Mother Occupation','Overall Percentage (5th Sem)'],axis=1,inplace=True)
modelling_df.shape

(10000, 30)

In [45]:
target_mapping = {'35 to 50':1, '<35':0, '75 to 100':3, '50 to 75':2}
modelling_df['Target Group'] = modelling_df['Target Group'].map(target_mapping)

In [46]:
modelling_df = pd.get_dummies(modelling_df)
modelling_df.shape

(10000, 36)

# Model Building

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [62]:
# Define features (X) and target variable (y)
X = modelling_df.drop(['Target Group'], axis=1)  # Features
y = modelling_df['Target Group']  # Target variable
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
# Initialize XGBoost classifier
model = XGBClassifier()
# Train the model
model.fit(X_train, y_train)
# Make predictions on the testing data
predictions = model.predict(X_test)

In [64]:

# Classification report
print("Classification Report:")
print(classification_report(y_test, predictions))



Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       728
           1       1.00      1.00      1.00       278
           2       1.00      1.00      1.00       489
           3       1.00      1.00      1.00       505

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [65]:
# Confusion matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

Confusion Matrix:
[[728   0   0   0]
 [  0 278   0   0]
 [  0   0 489   0]
 [  0   0   0 505]]


In [53]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [57]:
# Define the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
# Make predictions on the testing data
predictions = rf_classifier.predict(X_test)

In [58]:
# Classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

# Confusion matrix
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       728
           1       1.00      1.00      1.00       278
           2       1.00      1.00      1.00       489
           3       1.00      1.00      1.00       505

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
[[728   0   0   0]
 [  0 278   0   0]
 [  0   0 489   0]
 [  0   0   0 505]]


In [None]:
# Define the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Use the best model to make predictions
best_rf_classifier = grid_search.best_estimator_
y_pred = best_rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [59]:

feature_importances = rf_classifier.feature_importances_
feature_names = X_train.columns

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances for Random Forest Model')
plt.show()

In [60]:
modelling_df.head()

Unnamed: 0,Gender,10th Percentage,12th Percentage,12th Stream,Family Count,Siblings Count,English_1st_Sem,Math_1st_Sem,Basic Science_1st_Sem,ICT_1st_Sem,WPC_1st_Sem,EEC_2st_Sem,AMI_2st_Sem,BEC_2st_Sem,PCI_2st_Sem,BCC_2st_Sem,CPH_2st_Sem,WPD_2st_Sem,OOP_3st_Sem,DSU_3st_Sem,CGR_3st_Sem,DMS_3st_Sem,DTE_3st_Sem,JPR_4st_Sem,SEN_4st_Sem,DCC_4st_Sem,MIC_4st_Sem,GAD_4st_Sem,Target Group,Father Occupation_Army,Father Occupation_Business,Father Occupation_Farmer,Father Occupation_Government Job,Father Occupation_Housewife,Father Occupation_No Job,Father Occupation_Private Job
0,1,50,52,0,3,1,84.4,60.8,88.8,59.6,76.0,67.6,78.4,78.8,74.4,68.0,62.8,64.4,89.6,56.8,79.2,90.0,51.2,70.0,64.8,80.4,70.8,82.0,1,0,0,1,0,0,0,0
1,0,70,90,2,1,0,10.0,12.2,8.3,9.5,9.6,10.9,9.8,11.9,6.6,7.2,8.6,11.5,12.1,7.3,11.3,14.7,12.5,5.6,14.0,10.0,12.5,8.1,0,1,0,0,0,0,0,0
2,1,95,80,0,5,5,31.0,11.0,18.0,46.0,37.0,81.0,91.0,13.0,85.0,0.0,41.0,40.0,87.0,41.0,26.0,56.0,37.0,54.0,100.0,32.0,77.0,56.0,3,0,0,0,0,0,0,1
3,1,47,65,0,8,1,9.0,9.0,98.0,40.0,54.0,38.0,43.0,71.0,90.0,73.0,33.0,53.0,51.0,80.0,15.0,46.0,40.0,53.0,87.0,43.0,47.0,9.0,3,0,0,0,0,0,1,0
4,1,18,30,0,9,5,15.0,14.5,12.0,12.3,9.2,9.5,8.2,10.8,8.6,5.7,7.9,6.9,7.6,14.8,9.8,7.5,10.3,14.5,12.5,6.6,14.2,5.1,0,0,1,0,0,0,0,0


In [61]:
modelling_df.columns

Index(['Gender', '10th Percentage', '12th Percentage', '12th Stream',
       'Family Count', 'Siblings Count', 'English_1st_Sem', 'Math_1st_Sem',
       'Basic Science_1st_Sem', 'ICT_1st_Sem', 'WPC_1st_Sem', 'EEC_2st_Sem',
       'AMI_2st_Sem', 'BEC_2st_Sem', 'PCI_2st_Sem', 'BCC_2st_Sem',
       'CPH_2st_Sem', 'WPD_2st_Sem', 'OOP_3st_Sem', 'DSU_3st_Sem',
       'CGR_3st_Sem', 'DMS_3st_Sem', 'DTE_3st_Sem', 'JPR_4st_Sem',
       'SEN_4st_Sem', 'DCC_4st_Sem', 'MIC_4st_Sem', 'GAD_4st_Sem',
       'Target Group', 'Father Occupation_Army', 'Father Occupation_Business',
       'Father Occupation_Farmer', 'Father Occupation_Government Job',
       'Father Occupation_Housewife', 'Father Occupation_No Job',
       'Father Occupation_Private Job'],
      dtype='object')

In [66]:
import joblib

# Assuming your XGBoost model is named 'model'
# Save the model to a pickle file
joblib.dump(model, 'xgboost_model.pkl')

['xgboost_model.pkl']

In [67]:
modelling_df.columns

Index(['Gender', '10th Percentage', '12th Percentage', '12th Stream',
       'Family Count', 'Siblings Count', 'English_1st_Sem', 'Math_1st_Sem',
       'Basic Science_1st_Sem', 'ICT_1st_Sem', 'WPC_1st_Sem', 'EEC_2st_Sem',
       'AMI_2st_Sem', 'BEC_2st_Sem', 'PCI_2st_Sem', 'BCC_2st_Sem',
       'CPH_2st_Sem', 'WPD_2st_Sem', 'OOP_3st_Sem', 'DSU_3st_Sem',
       'CGR_3st_Sem', 'DMS_3st_Sem', 'DTE_3st_Sem', 'JPR_4st_Sem',
       'SEN_4st_Sem', 'DCC_4st_Sem', 'MIC_4st_Sem', 'GAD_4st_Sem',
       'Target Group', 'Father Occupation_Army', 'Father Occupation_Business',
       'Father Occupation_Farmer', 'Father Occupation_Government Job',
       'Father Occupation_Housewife', 'Father Occupation_No Job',
       'Father Occupation_Private Job'],
      dtype='object')