In [1]:
!pip install --quiet xlrd
print('pip install xlrd complete')

pip install xlrd complete


In [2]:
import pandas as pd
import numpy as np


# Path of the Excel workbook and the names of the sheets to load from it.
DATA = '/kaggle/input/university-students-marks-sheet/Univeristy_Results.xls'

sheet_names = ['Final', 'Sessional', 'Table']

# Passing the list of names makes read_excel open the workbook once and return
# a dict of DataFrames keyed by sheet name, instead of re-reading the file for
# every sheet.
sheets_by_name = pd.read_excel(io=DATA, sheet_name=sheet_names, engine='xlrd')
final_df = sheets_by_name[sheet_names[0]]
session_df = sheets_by_name[sheet_names[1]]
table_df = sheets_by_name[sheet_names[2]]
# Give the sessional sheet short column names: roll number, student name, two
# quizzes (Q1, Q2) with AV1, two assignments (A1, A2) with AV2, the midterm (M)
# and the total.
session_df.columns = ['roll', 'name', 'Q1', 'Q2', 'AV1', 'A1', 'A2', 'AV2', 'M', 'Total']

# Drop the row labelled 0 and keep at most the first 216 remaining rows.
# NOTE(review): presumably row 0 is a secondary header row and the rows past 216
# are not student records; confirm against the workbook.
# .copy() makes the result an independent frame, so the column assignments below
# do not write into a slice of the raw sheet (avoids SettingWithCopyWarning).
session_df = session_df[session_df.index != 0].head(216).copy()

# Every column except the student name is expected to hold a number.
numeric_columns = [item for item in session_df.columns if item != 'name']

# Strip surrounding whitespace from string cells, treat cells that end up empty
# as '0', convert everything to float and fill the remaining missing values
# with 0. Non-string cells are passed through to the float conversion unchanged.
session_df[numeric_columns] = (
    session_df[numeric_columns]
    .apply(lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x))
    .replace('', '0')
    .astype(float)
    .fillna(value=0)
)
session_df.head()

Unnamed: 0,roll,name,Q1,Q2,AV1,A1,A2,AV2,M,Total
1,1.0,Ashir Mehfooz,14.0,14.0,14.0,13.0,13.0,13.0,41.0,68.0
2,2.0,Atif Raftad,4.0,10.0,7.0,4.0,5.0,4.5,30.0,41.5
3,3.0,Saiqa Aziz,15.0,11.0,13.0,14.0,13.0,13.5,34.0,60.5
4,8.0,Ozair Minhas,6.0,5.0,5.5,4.0,6.0,5.0,12.0,22.5
5,9.0,Naveera Subhani,5.0,11.0,8.0,4.0,5.0,4.5,34.0,46.5


It looks like we have two assignments, two quizzes, and two averages, plus a midterm. Let's see what weights we get if we treat all of them as first-order values.

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Baseline model: regress Total on every score column, averages included.
scores = ['Q1', 'Q2', 'AV1', 'A1', 'A2', 'AV2', 'M']
all_score_features = session_df[scores]
total_target = session_df['Total']
X_train, X_test, y_train, y_test = train_test_split(
    all_score_features,
    total_target,
    test_size=0.25,
    random_state=2024,
)

# Fit on the training split and report R^2 on the held-out quarter.
linear = LinearRegression().fit(X_train, y_train)
baseline_r2 = r2_score(y_true=y_test, y_pred=linear.predict(X_test))
print(f'r2: {baseline_r2:5.4f}')


r2: 0.9458


In [4]:
from plotly.express import pie
# Show the fitted coefficient of each score column as a slice of a pie chart.
coefficient_pie = pie(
    names=scores,
    values=linear.coef_,
    color=linear.coef_,
)
coefficient_pie

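Now let's try again and ignore the averages, as we suspect they double-count the quiz and assignment scores: in the rows shown above, AV1 is the mean of Q1 and Q2, and AV2 is the mean of A1 and A2.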

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from plotly.express import pie

# Second model: same split settings, but without the AV1/AV2 columns.
no_averages = ['Q1', 'Q2', 'A1', 'A2', 'M']
raw_score_features = session_df[no_averages]
raw_total_target = session_df['Total']
X_train, X_test, y_train, y_test = train_test_split(
    raw_score_features,
    raw_total_target,
    test_size=0.25,
    random_state=2024,
)

# Fit, report R^2 on the held-out quarter, then chart the coefficients.
linear_no_avg = LinearRegression().fit(X_train, y_train)
no_avg_r2 = r2_score(y_true=y_test, y_pred=linear_no_avg.predict(X_test))
print(f'r2: {no_avg_r2:5.4f}')
no_avg_pie = pie(names=no_averages, values=linear_no_avg.coef_, color=linear_no_avg.coef_)
no_avg_pie.show()

r2: 0.9458


This seems more plausible as it looks like the two assignments and the two quizzes are weighted 15% each and the midterm is 40%. Where are the errors coming from? Probably from three sources:
* We have filled in missing values with zeros and that may or may not be correct
* Points have been reported in half-point increments, which introduces some rounding error
* Some students apparently withdrew and got zero total scores while having nonzero assignment/quiz scores


In [6]:
# List every student whose recorded total is exactly zero.
has_zero_total = session_df['Total'].eq(0)
session_df.loc[has_zero_total]

Unnamed: 0,roll,name,Q1,Q2,AV1,A1,A2,AV2,M,Total
7,11.0,Seher Ishtiaq,12.0,12.0,12.0,13.0,12.0,12.5,0.0,0.0
13,18.0,Ahmed Ali,13.0,14.0,13.5,7.0,14.0,10.5,0.0,0.0
27,33.0,Muhammad Ishtiaq,6.0,13.0,9.5,0.0,6.0,3.0,0.0,0.0
38,0.0,Aslam Ch,14.0,2.0,8.0,13.0,4.0,8.5,0.0,0.0
61,0.0,Talha,5.0,3.0,4.0,13.0,2.0,7.5,0.0,0.0
76,82.0,Büşra,9.0,1.0,5.0,14.0,0.0,7.0,0.0,0.0
100,106.0,James Miller,14.0,15.0,14.5,4.0,14.0,9.0,0.0,0.0
125,131.0,Yusuf,4.0,6.0,5.0,13.0,2.0,7.5,0.0,0.0
155,161.0,Fadi Nabil,13.0,5.0,9.0,13.0,5.0,9.0,0.0,0.0
179,185.0,Erkan,4.0,6.0,5.0,3.0,5.0,4.0,0.0,0.0


Let's remove the students with non-zero partial scores and zero total scores and rebuild our model.

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from plotly.express import pie

# Third model: raw scores only, restricted to students with a non-zero total.
no_averages = ['Q1', 'Q2', 'A1', 'A2', 'M']
no_zero_df = session_df.loc[session_df['Total'].ne(0)]
X_train, X_test, y_train, y_test = train_test_split(
    no_zero_df[no_averages],
    no_zero_df['Total'],
    test_size=0.25,
    random_state=2024,
)

# Fit, report R^2 on the held-out quarter, then chart the coefficients.
linear2 = LinearRegression().fit(X_train, y_train)
no_zero_r2 = r2_score(y_true=y_test, y_pred=linear2.predict(X_test))
print(f'r2: {no_zero_r2:5.4f}')
no_zero_pie = pie(names=no_averages, values=linear2.coef_, color=linear2.coef_)
no_zero_pie.show()

r2: 1.0000


This makes more sense: the fit is now essentially exact. The midterm carries twice the weight of each quiz and assignment, i.e. Total = M + 0.5 × (Q1 + Q2 + A1 + A2).

In [8]:
from plotly.express import scatter
# Rebuild the total from its parts: the midterm at full weight plus half of the
# summed assignment and quiz scores, stored as a new 'synthetic' column.
half_weight_columns = ['A1', 'A2', 'Q1', 'Q2']
half_weight_sum = session_df[half_weight_columns].sum(axis=1)
session_df['synthetic'] = session_df['M'] + 0.5 * half_weight_sum

# Plot the reconstructed total against the recorded one with an OLS trendline.
synthetic_vs_total = scatter(
    data_frame=session_df,
    x='Total',
    y='synthetic',
    hover_name='name',
    trendline='ols',
)
synthetic_vs_total