# BITSAT Odyssey: Navigating Students' Success in College


### Importing the necessary libraries


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from collections import defaultdict
from pandas import isna

### Getting the data


In [2]:
df = pd.read_csv("final-dataset.csv")

In [3]:
df.head()

Unnamed: 0,Timestamp,Your BITSAT (or SAT) Score,Maximum BITSAT Score,Field of Study,Semester Wise Scores [9.51 - 10.00],Semester Wise Scores [9.01 - 9.50],Semester Wise Scores [8.51 - 9.00],Semester Wise Scores [8.01 - 8.50],Semester Wise Scores [7.51 - 8.00],Semester Wise Scores [7.01 - 7.50],...,Semester Wise Scores [5.51 - 6.00],Semester Wise Scores [5.01 - 5.50],Semester Wise Scores [4.50 - 5.00],Current/Graduating CGPA,Current Semester,How many courses have you withdrawn from?,Your Pronouns (optional),Semester Wise Scores [Less than 4.50],Maximum SAT Score,Are you an international student?
0,11/8/2023 19:03:18,282,450,A3,,1-1 SGPA,2-2 SGPA,2-1 SGPA,1-2 SGPA,,...,,,,8.45,5,1,,,,
1,11/8/2023 19:07:00,335,390,A7,,1-2 SGPA,1-1 SGPA,,,,...,,,,8.88,3,1,he/him,,,
2,11/8/2023 19:11:08,358,450,A7,,,2-1 SGPA,1-2 SGPA,2-2 SGPA,1-1 SGPA,...,,,,8.28,5,1,He/Him,,,
3,11/8/2023 19:28:09,281,450,"B4, AA",,,1-1 SGPA,,1-2 SGPA,"2-1 SGPA, 3-1 SGPA, 3-2 SGPA",...,,,,7.5,7,0,,,,
4,11/8/2023 19:33:39,281,450,A4,,,,,3-2 SGPA,"1-1 SGPA, 1-2 SGPA, 3-1 SGPA",...,,,,7.4,7,2,kezzy/uwu,,,


### BITSAT Score v/s Latest CGPA Analysis

For the analysis, we will use mainly 3 features of the data set, namely:

- BITSAT Score
- Current CGPA (Final CGPA, if already graduated)
- Maximum possible BITSAT Score


In [4]:
bitsat_current = pd.DataFrame(
    df[
        [
            "Your BITSAT (or SAT) Score",
            "Current/Graduating CGPA",
            "Maximum BITSAT Score",
        ]
    ].values
)

In [5]:
bitsat_current.rename(
    columns={0: "bitsat_score", 1: "latest_cgpa", 2: "max_bitsat_score"}, inplace=True
)

#### Removing data corresponding to students who gave the SAT

We mark the rows that contain the records of the students who did not take the BITSAT


In [6]:
"""Syntax:
df2 = df.applymap(lambda x: 1 if x >=25 else 0)
"""

bitsat_current["max_bitsat_score"] = bitsat_current["max_bitsat_score"].apply(
    lambda x: x if x.isdigit() else "NaN"
)

We keep only those rows that contain data of students who took the BITSAT


In [7]:
bitsat_current = bitsat_current.loc[bitsat_current["max_bitsat_score"] != "NaN"]

In [8]:
bitsat_current = bitsat_current.astype(
    {"bitsat_score": int, "latest_cgpa": float, "max_bitsat_score": str}
)

In [9]:
bitsat_current = bitsat_current.loc[bitsat_current["bitsat_score"] < 500]
bitsat_current.reset_index(inplace=True, drop=True)

In [10]:
bitsat_current.sort_values("latest_cgpa").head()

Unnamed: 0,bitsat_score,latest_cgpa,max_bitsat_score
25,264,5.54,450
86,192,5.78,390
21,208,6.07,450
19,249,6.16,450
51,180,6.54,390


In [11]:
bitsat_current.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bitsat_score      99 non-null     int32  
 1   latest_cgpa       99 non-null     float64
 2   max_bitsat_score  99 non-null     object 
dtypes: float64(1), int32(1), object(1)
memory usage: 2.1+ KB


In [12]:
bitsat_current_cgpa_plot = px.scatter(
    bitsat_current,
    x="latest_cgpa",
    y="bitsat_score",
    color=bitsat_current["max_bitsat_score"],
    color_discrete_map={
        "390": "rgb(0, 48, 73)",
        "450": "rgb(247, 127, 0)",
    },
    title="BITSAT v/s Current CGPA",
    labels={"max_bitsat_score": "Max BITSAT Score"},
)

bitsat_current_cgpa_plot.update_layout(
    xaxis_title="Latest CGPA",
    yaxis_title="BITSAT Score",
)

bitsat_current_cgpa_plot.show()

---


### Correlations

Given the BITSAT score, we will try to find some insights between latest CGPA and the number of courses dropped. We consider the following features of the data set for this analysis:

- BITSAT Score
- Latest CGPA
- Number of courses dropped/withdrawn from
- Maximum BITSAT Score


In [13]:
bitsat_cg_dropped = pd.DataFrame(
    df[
        [
            "Your BITSAT (or SAT) Score",
            "Current/Graduating CGPA",
            "How many courses have you withdrawn from?",
            "Maximum BITSAT Score",
        ]
    ].values
)

In [14]:
bitsat_cg_dropped.head(3)

Unnamed: 0,0,1,2,3
0,282,8.45,1,450
1,335,8.88,1,390
2,358,8.28,1,450


#### Renaming the columns


In [15]:
bitsat_cg_dropped.rename(
    columns={0: "bitsat_score", 1: "cgpa", 2: "dropped_courses", 3: "max_bitsat_score"},
    inplace=True,
)

#### Removing data corresponding to students who gave the SAT


In [16]:
bitsat_cg_dropped["max_bitsat_score"] = bitsat_cg_dropped["max_bitsat_score"].apply(
    lambda x: x if x.isdigit() else -1
)

In [17]:
bitsat_cg_dropped = bitsat_cg_dropped.loc[
    (bitsat_cg_dropped["bitsat_score"] < 500)
    & (bitsat_cg_dropped["max_bitsat_score"] != -1)
]
bitsat_cg_dropped.reset_index(drop=True, inplace=True)

In [18]:
bitsat_cg_dropped.head(3)

Unnamed: 0,bitsat_score,cgpa,dropped_courses,max_bitsat_score
0,282,8.45,1,450
1,335,8.88,1,390
2,358,8.28,1,450


#### Setting explicit data types for the columns for ease of operation


In [19]:
bitsat_cg_dropped = bitsat_cg_dropped.astype(
    {
        "bitsat_score": int,
        "cgpa": float,
        "dropped_courses": int,
        "max_bitsat_score": int,
    },
)

In [20]:
bitsat_cg_dropped.describe()

Unnamed: 0,bitsat_score,cgpa,dropped_courses,max_bitsat_score
count,99.0,99.0,99.0,99.0
mean,289.545455,8.07797,0.69697,433.636364
std,44.651776,0.83022,0.919789,26.857695
min,180.0,5.54,0.0,390.0
25%,261.0,7.59,0.0,390.0
50%,287.0,8.1,0.0,450.0
75%,330.0,8.585,1.0,450.0
max,370.0,9.81,4.0,450.0


#### Creating Intervals for the BITSAT Score


In [21]:
bin_intervals = [180, 210, 240, 270, 300, 330, 360, 390, 420, np.inf]
bin_labels = [
    "180-209",
    "210-239",
    "240-269",
    "270-299",
    "300-329",
    "330-359",
    "360-389",
    "390-419",
    "420+",
]
bitsat_cg_dropped["score_intervals"] = pd.cut(
    bitsat_cg_dropped["bitsat_score"],
    bins=bin_intervals,
    labels=bin_labels,
    right=True,
    precision=0,
    ordered=True,
)

In [22]:
bitsat_cg_dropped.head()

Unnamed: 0,bitsat_score,cgpa,dropped_courses,max_bitsat_score,score_intervals
0,282,8.45,1,450,270-299
1,335,8.88,1,390,330-359
2,358,8.28,1,450,330-359
3,281,7.5,0,450,270-299
4,281,7.4,2,450,270-299


In [23]:
bitsat_cg_dropped_plotting_data = bitsat_cg_dropped.copy()

### BITSAT 450 and BITSAT 390

- Up until 2022, the BITSAT was out of a maximum 450 marks
- It was later changed to 390 marks
- This is why we split our dataset into two –
  - `BITSAT 390`: corresponding to the test out of 390
  - `BITSAT 450`: corresponding to the test out of 450


In [24]:
bitsat_cg_dropped_plotting_data_390 = bitsat_cg_dropped.copy()
bitsat_cg_dropped_plotting_data_450 = bitsat_cg_dropped.copy()

In [25]:
bitsat_cg_dropped_plotting_data_390 = bitsat_cg_dropped_plotting_data_390.loc[
    bitsat_cg_dropped_plotting_data_390["max_bitsat_score"] == 390
]


bitsat_cg_dropped_plotting_data_450 = bitsat_cg_dropped_plotting_data_450.loc[
    bitsat_cg_dropped_plotting_data_450["max_bitsat_score"] == 450
]

### Aggregation Functions

We use the below functions while merging the rows

- `CGPA`: mean
- `BITSAT Score`: mean
- `Dropped Courses`: mean


In [26]:
aggregation_functions = {
    "cgpa": "mean",
    "bitsat_score": "mean",
    "dropped_courses": "mean",
}

### BITSAT 390


##### Merging rows

- We merge the rows together based on the BITSAT score interval
- The above aggregation functions are used to merge the rows


In [27]:
bitsat_cg_dropped_plotting_data_390 = bitsat_cg_dropped_plotting_data_390.groupby(
    bitsat_cg_dropped_plotting_data_390["score_intervals"]
).aggregate(aggregation_functions)

In [28]:
bitsat_cg_dropped_plotting_data_390

Unnamed: 0_level_0,cgpa,bitsat_score,dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.9,195.333333,0.333333
210-239,7.838333,223.666667,0.0
240-269,8.468333,252.5,0.0
270-299,7.923625,284.625,0.5
300-329,9.73,315.0,0.0
330-359,9.345,346.5,1.0
360-389,,,
390-419,,,
420+,,,


We drop the rows corresponding to a score more than $390$ since those correspond to `BITSAT 450`


In [29]:
"""Syntax:
df.drop([0, 1])
"""

bitsat_cg_dropped_plotting_data_390.drop(["390-419", "420+"], inplace=True)

In [30]:
"""Syntax:
energy.rename(index={'Republic of Korea':'South Korea'},inplace=True)
"""

bitsat_cg_dropped_plotting_data_390.rename(index={"330-359": "330-350+"}, inplace=True)

In [31]:
bitsat_cg_dropped_plotting_data_390

Unnamed: 0_level_0,cgpa,bitsat_score,dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.9,195.333333,0.333333
210-239,7.838333,223.666667,0.0
240-269,8.468333,252.5,0.0
270-299,7.923625,284.625,0.5
300-329,9.73,315.0,0.0
330-350+,9.345,346.5,1.0
360-389,,,


In [32]:
bitsat_cg_dropped_plotting_data_390.rename(
    columns={
        "cgpa": "mean_cgpa",
        "bitsat_score": "mean_bitsat_score",
        "dropped_courses": "mean_dropped_courses",
    },
    inplace=True,
)

In [33]:
bitsat_cg_dropped_plotting_data_390

Unnamed: 0_level_0,mean_cgpa,mean_bitsat_score,mean_dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.9,195.333333,0.333333
210-239,7.838333,223.666667,0.0
240-269,8.468333,252.5,0.0
270-299,7.923625,284.625,0.5
300-329,9.73,315.0,0.0
330-350+,9.345,346.5,1.0
360-389,,,


In [34]:
bitsat_cg_dropped_plotting_data.rename(
    columns={
        "cgpa": "mean_cgpa",
        "bitsat_score": "mean_bitsat_score",
        "dropped_courses": "mean_dropped_courses",
    },
    inplace=True,
)

#### Plotting 390


In [35]:
"""Syntax:
fig = px.bar(data_canada, x='year', y='pop')
"""

score_interval_bitsat_390 = px.bar(
    bitsat_cg_dropped_plotting_data_390,
    x=bitsat_cg_dropped_plotting_data_390.index,
    y=bitsat_cg_dropped_plotting_data_390["mean_bitsat_score"],
)

score_interval_bitsat_390.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average BITSAT Interval Score",
    title="Average Interval Score v/s BITSAT 390",
)

score_interval_bitsat_390.show()

In [36]:
bitsat_cg_dropped_plotting_data_390

Unnamed: 0_level_0,mean_cgpa,mean_bitsat_score,mean_dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.9,195.333333,0.333333
210-239,7.838333,223.666667,0.0
240-269,8.468333,252.5,0.0
270-299,7.923625,284.625,0.5
300-329,9.73,315.0,0.0
330-350+,9.345,346.5,1.0
360-389,,,


In [37]:
score_interval_cgpa_390 = px.bar(
    bitsat_cg_dropped_plotting_data_390,
    x=bitsat_cg_dropped_plotting_data_390.index,
    y=bitsat_cg_dropped_plotting_data_390["mean_cgpa"],
    color=bitsat_cg_dropped_plotting_data_390.index,
    color_discrete_sequence=[
        px.colors.sample_colorscale("purples", v)[0]
        if not isna(v) else 0
        for v in (
            bitsat_cg_dropped_plotting_data_390["mean_cgpa"]
            / bitsat_cg_dropped_plotting_data_390["mean_cgpa"].max()
        ).tolist()
    ],
)

score_interval_cgpa_390.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average Latest CGPA",
    title="Average Latest CGPA v/s BITSAT 390",
)

score_interval_cgpa_390.show()

In [38]:
score_interval_dropped_390 = px.bar(
    bitsat_cg_dropped_plotting_data_390,
    x=bitsat_cg_dropped_plotting_data_390.index,
    y=bitsat_cg_dropped_plotting_data_390["mean_dropped_courses"],
)

score_interval_dropped_390.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average Courses Dropped",
    title="Average No. of Courses Dropped v/s BITSAT 390",
)

score_interval_dropped_390.show()

---


### BITSAT 450


In [39]:
bitsat_cg_dropped_plotting_data_450 = bitsat_cg_dropped_plotting_data_450.groupby(
    bitsat_cg_dropped_plotting_data_450["score_intervals"]
).aggregate(aggregation_functions)

In [40]:
bitsat_cg_dropped_plotting_data_450

Unnamed: 0_level_0,cgpa,bitsat_score,dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.07,208.0,1.0
210-239,8.02,223.5,0.0
240-269,7.770769,258.846154,0.384615
270-299,8.243333,284.952381,0.809524
300-329,8.277692,318.692308,1.0
330-359,8.165714,344.761905,1.238095
360-389,7.5,370.0,0.0
390-419,,,
420+,,,


In [41]:
bitsat_cg_dropped_plotting_data_450.rename(
    columns={
        "cgpa": "mean_cgpa",
        "bitsat_score": "mean_bitsat_score",
        "dropped_courses": "mean_dropped_courses",
    },
    inplace=True,
)

In [42]:
bitsat_cg_dropped_plotting_data_450

Unnamed: 0_level_0,mean_cgpa,mean_bitsat_score,mean_dropped_courses
score_intervals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180-209,6.07,208.0,1.0
210-239,8.02,223.5,0.0
240-269,7.770769,258.846154,0.384615
270-299,8.243333,284.952381,0.809524
300-329,8.277692,318.692308,1.0
330-359,8.165714,344.761905,1.238095
360-389,7.5,370.0,0.0
390-419,,,
420+,,,


#### Plotting BITSAT 450


In [43]:
"""
fig = px.bar(data_canada, x='year', y='pop')
"""
score_interval_bitsat_450 = px.bar(
    bitsat_cg_dropped_plotting_data_450,
    x=bitsat_cg_dropped_plotting_data_450.index,
    y=bitsat_cg_dropped_plotting_data_450["mean_bitsat_score"],
)

score_interval_bitsat_450.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average BITSAT Interval Score",
    title="Average Interval Score v/s BITSAT 450",
)

score_interval_bitsat_450.show()

In [44]:
score_interval_cgpa_450 = px.bar(
    bitsat_cg_dropped_plotting_data_450,
    x=bitsat_cg_dropped_plotting_data_450.index,
    y=bitsat_cg_dropped_plotting_data_450["mean_cgpa"],
    color=bitsat_cg_dropped_plotting_data_450.index,
    color_discrete_sequence=[
        px.colors.sample_colorscale("emrld", v)[0]
        if not isna(v) else 0
        for v in (
            bitsat_cg_dropped_plotting_data_450["mean_cgpa"]
            / bitsat_cg_dropped_plotting_data_450["mean_cgpa"].max()
        ).tolist()
    ],
)

score_interval_cgpa_450.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average Latest CGPA",
    title="Average Latest CGPA v/s BITSAT 450",
)

score_interval_cgpa_450.show()

In [45]:
score_interval_dropped_450 = px.bar(
    bitsat_cg_dropped_plotting_data_450,
    x=bitsat_cg_dropped_plotting_data_450.index,
    y=bitsat_cg_dropped_plotting_data_450["mean_dropped_courses"],
    color=bitsat_cg_dropped_plotting_data_450.index,
    color_discrete_sequence=[
        px.colors.sample_colorscale("pubugn", v)[0] if not isna(v) else 0
        for v in (
            bitsat_cg_dropped_plotting_data_450["mean_dropped_courses"]
            / bitsat_cg_dropped_plotting_data_450["mean_dropped_courses"].max()
        ).tolist()
    ],
)

score_interval_dropped_450.update_layout(
    xaxis_title="BITSAT Score Intervals",
    yaxis_title="Average Courses Dropped",
    title="Average No. of Courses Dropped v/s BITSAT 450",
    plot_bgcolor="rgb(255,255,255)",
)

score_interval_dropped_450.show()

---


### SGPA Data


In [46]:
df.columns

Index(['Timestamp', 'Your BITSAT (or SAT) Score', 'Maximum BITSAT Score',
       'Field of Study', 'Semester Wise Scores [9.51 - 10.00]',
       'Semester Wise Scores [9.01 - 9.50]',
       'Semester Wise Scores [8.51 - 9.00]',
       'Semester Wise Scores [8.01 - 8.50]',
       'Semester Wise Scores [7.51 - 8.00]',
       'Semester Wise Scores [7.01 - 7.50]',
       'Semester Wise Scores [6.51 - 7.00]',
       'Semester Wise Scores [6.01 - 6.50]',
       'Semester Wise Scores [5.51 - 6.00]',
       'Semester Wise Scores [5.01 - 5.50]',
       'Semester Wise Scores [4.50 - 5.00]', 'Current/Graduating CGPA',
       'Current Semester', 'How many courses have you withdrawn from?',
       'Your Pronouns (optional)', 'Semester Wise Scores [Less than 4.50]',
       'Maximum SAT Score', 'Are you an international student?'],
      dtype='object')

In [47]:
sgpa_data = df[
    [
        "Semester Wise Scores [9.51 - 10.00]",
        "Semester Wise Scores [9.01 - 9.50]",
        "Semester Wise Scores [8.51 - 9.00]",
        "Semester Wise Scores [8.01 - 8.50]",
        "Semester Wise Scores [7.51 - 8.00]",
        "Semester Wise Scores [7.01 - 7.50]",
        "Semester Wise Scores [6.51 - 7.00]",
        "Semester Wise Scores [6.01 - 6.50]",
        "Semester Wise Scores [5.51 - 6.00]",
        "Semester Wise Scores [5.01 - 5.50]",
        "Semester Wise Scores [4.50 - 5.00]",
    ]
].copy()

In [48]:
sgpa_data.head()

Unnamed: 0,Semester Wise Scores [9.51 - 10.00],Semester Wise Scores [9.01 - 9.50],Semester Wise Scores [8.51 - 9.00],Semester Wise Scores [8.01 - 8.50],Semester Wise Scores [7.51 - 8.00],Semester Wise Scores [7.01 - 7.50],Semester Wise Scores [6.51 - 7.00],Semester Wise Scores [6.01 - 6.50],Semester Wise Scores [5.51 - 6.00],Semester Wise Scores [5.01 - 5.50],Semester Wise Scores [4.50 - 5.00]
0,,1-1 SGPA,2-2 SGPA,2-1 SGPA,1-2 SGPA,,,,,,
1,,1-2 SGPA,1-1 SGPA,,,,,,,,
2,,,2-1 SGPA,1-2 SGPA,2-2 SGPA,1-1 SGPA,,,,,
3,,,1-1 SGPA,,1-2 SGPA,"2-1 SGPA, 3-1 SGPA, 3-2 SGPA",2-2 SGPA,,,,
4,,,,,3-2 SGPA,"1-1 SGPA, 1-2 SGPA, 3-1 SGPA",,2-1 SGPA,,,


In [49]:
sgpa_map = {
    "Semester Wise Scores [9.51 - 10.00]": 9.75,
    "Semester Wise Scores [9.01 - 9.50]": 9.25,
    "Semester Wise Scores [8.51 - 9.00]": 8.75,
    "Semester Wise Scores [8.01 - 8.50]": 8.25,
    "Semester Wise Scores [7.51 - 8.00]": 7.75,
    "Semester Wise Scores [7.01 - 7.50]": 7.25,
    "Semester Wise Scores [6.51 - 7.00]": 6.75,
    "Semester Wise Scores [6.01 - 6.50]": 6.25,
    "Semester Wise Scores [5.51 - 6.00]": 5.75,
    "Semester Wise Scores [5.01 - 5.50]": 5.25,
    "Semester Wise Scores [4.50 - 5.00]": 4.75,
}

In [50]:
first_sem_sgpa_count = defaultdict(int)

In [51]:
first_sem_sgpa_plot_data = {
    "Semester Wise Scores [9.51 - 10.00]": 0,
    "Semester Wise Scores [9.01 - 9.50]": 0,
    "Semester Wise Scores [8.51 - 9.00]": 0,
    "Semester Wise Scores [8.01 - 8.50]": 0,
    "Semester Wise Scores [7.51 - 8.00]": 0,
    "Semester Wise Scores [7.01 - 7.50]": 0,
    "Semester Wise Scores [6.51 - 7.00]": 0,
    "Semester Wise Scores [6.01 - 6.50]": 0,
    "Semester Wise Scores [5.51 - 6.00]": 0,
    "Semester Wise Scores [5.01 - 5.50]": 0,
    "Semester Wise Scores [4.50 - 5.00]": 0,
}

In [52]:
FIRST_SEM = "1-1 SGPA"

for column, values in sgpa_data.iteritems():  # type: ignore
    # Access each column as a pandas Series
    for index, sems in values.iteritems():
        # Access each value within the column
        if FIRST_SEM in str(sems):
            mean_sgpa = sgpa_map[column]
            first_sem_sgpa_count[mean_sgpa] += 1
            first_sem_sgpa_plot_data[column] += 1


iteritems is deprecated and will be removed in a future version. Use .items instead.


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [53]:
sum(first_sem_sgpa_count[key] for key in first_sem_sgpa_count)

103

In [54]:
first_sem_sgpa_plot_data

{'Semester Wise Scores [9.51 - 10.00]': 7,
 'Semester Wise Scores [9.01 - 9.50]': 14,
 'Semester Wise Scores [8.51 - 9.00]': 18,
 'Semester Wise Scores [8.01 - 8.50]': 17,
 'Semester Wise Scores [7.51 - 8.00]': 21,
 'Semester Wise Scores [7.01 - 7.50]': 12,
 'Semester Wise Scores [6.51 - 7.00]': 8,
 'Semester Wise Scores [6.01 - 6.50]': 3,
 'Semester Wise Scores [5.51 - 6.00]': 2,
 'Semester Wise Scores [5.01 - 5.50]': 0,
 'Semester Wise Scores [4.50 - 5.00]': 1}

In [55]:
first_sem_sgpa_plot_data_df = pd.DataFrame(
    first_sem_sgpa_plot_data.items(), columns=["Score", "Count"]
)

- Our labels currently are of the form `Semester Wise Scores [8.01 - 8.50]`
- We'll remove `Semester Wise Scores [` and the closing `]` so our labels look cleaner


In [56]:
first_sem_sgpa_plot_data_df["Score"] = first_sem_sgpa_plot_data_df["Score"].str.replace(
    "Semester Wise Scores \\[", "", regex=True
)
first_sem_sgpa_plot_data_df["Score"] = first_sem_sgpa_plot_data_df["Score"].str.replace(
    "\\]", "", regex=True
)

Before the transformation, our labels looked like: `Semester Wise Scores [8.01 - 8.50]` <br>
After the transformation, our labels look like: `8.01 - 8.50`

In [57]:
first_sem_sgpa_plot_data_df = first_sem_sgpa_plot_data_df[::-1]

In [58]:
first_sem_sgpa_plot_data_df

Unnamed: 0,Score,Count
10,4.50 - 5.00,1
9,5.01 - 5.50,0
8,5.51 - 6.00,2
7,6.01 - 6.50,3
6,6.51 - 7.00,8
5,7.01 - 7.50,12
4,7.51 - 8.00,21
3,8.01 - 8.50,17
2,8.51 - 9.00,18
1,9.01 - 9.50,14


#### Plotting 1st Sem SGPA Distribution


In [59]:
first_sem_total_values = sum(
    first_sem_sgpa_plot_data[key] for key in first_sem_sgpa_plot_data
)

first_sem_sgpa_plot = px.bar(
    first_sem_sgpa_plot_data_df,
    x=first_sem_sgpa_plot_data_df["Score"],
    y=first_sem_sgpa_plot_data_df["Count"],
    title=f"First Sem SGPA Distribution | {first_sem_total_values} students",
    color=first_sem_sgpa_plot_data_df["Score"],
    color_discrete_sequence=[
        px.colors.sample_colorscale("burg", v)[0]
        for v in (
            first_sem_sgpa_plot_data_df["Count"]
            / first_sem_sgpa_plot_data_df["Count"].max()
        ).tolist()
    ],
    hover_data=dict(Score=False),
)

first_sem_sgpa_plot.update_layout(
    xaxis_title="SGPA Values",
    yaxis_title="Number of people in the range",
    paper_bgcolor="rgb(255,255,255)",
    plot_bgcolor="rgb(255,255,255)",
)

first_sem_sgpa_plot.show()

In [60]:
"""Syntax:
l.extend([x for i in range(100)])
"""

first_sem_sgpa_hist_data = []

for sgpa, n in first_sem_sgpa_count.items():
    first_sem_sgpa_hist_data.extend([sgpa for i in range(n)])

In [65]:
"""Syntax:
fig = ff.create_distplot(hist_data, group_labels)
fig.show()
"""

first_sem_total_values = sum(
    first_sem_sgpa_plot_data[key] for key in first_sem_sgpa_plot_data
)

first_sem_sgpa_dist_plot = ff.create_distplot(
    [first_sem_sgpa_hist_data],
    group_labels=["SGPA Distribution"],
    curve_type="normal",
)

first_sem_sgpa_dist_plot.update_layout(
    title_text=f"First Sem SGPA Distribution | {first_sem_total_values} students",
    xaxis_title="SGPA Values",
    yaxis_title="Number of people in the range",
)

first_sem_sgpa_dist_plot.show()

### Average First Semester SGPA

- From the above curves, we can see that the average SGPA for the first semester lies somewhere around 8
- Since we have taken the input in the form of ranges, we will use the midpoint of the range to perform the calculations
- This may be off from the true value, but it will give us a good rough estimate for the same


In [62]:
first_sem_sgpa_count

defaultdict(int,
            {9.75: 7,
             9.25: 14,
             8.75: 18,
             8.25: 17,
             7.75: 21,
             7.25: 12,
             6.75: 8,
             6.25: 3,
             5.75: 2,
             4.75: 1})

In [63]:
total = 0
student_count = 0
weighted_mean_first_sem_sgpa = 0

for sgpa, n in first_sem_sgpa_count.items():
    total += sgpa * n
    student_count += n

weighted_mean_first_sem_sgpa = total / student_count
print(weighted_mean_first_sem_sgpa)

8.099514563106796


From the above calculations, we can see that the average SGPA for the first semester is approximately $8.01$ (rounded off to 2 decimal places)


---


### Saving Plots


In [64]:
"""Syntax:
plt.write_html("plots/plt_name.html")
"""

score_interval_bitsat_390.write_html("plots/score_interval_bitsat_390.html")
score_interval_bitsat_450.write_html("plots/score_interval_bitsat_450.html")

score_interval_cgpa_390.write_html("plots/score_interval_cgpa_390.html")
score_interval_cgpa_450.write_html("plots/score_interval_cgpa_450.html")

score_interval_dropped_390.write_html("plots/score_interval_dropped_390.html")
score_interval_dropped_450.write_html("plots/score_interval_dropped_450.html")

bitsat_current_cgpa_plot.write_html("plots/bitsat_current_cgpa_plot.html")

first_sem_sgpa_plot.write_html("plots/first_sem_sgpa_plot.html")
first_sem_sgpa_dist_plot.write_html("plots/first_sem_sgpa_dist_plot.html")