In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Job Placement Dataset

jobdata = pd.read_csv('job.csv')
jobdata.info()
jobdata.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    object 
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(1), object(8)
memory usage: 25.3+ KB


Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
5,6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.0,Mkt&Fin,51.58,Not Placed,
6,7,F,46.0,Others,49.2,Others,Commerce,79.0,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed,
7,8,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0
8,9,M,73.0,Central,79.0,Central,Commerce,72.0,Comm&Mgmt,No,91.34,Mkt&Fin,61.29,Placed,231000.0
9,10,M,58.0,Central,70.0,Central,Commerce,61.0,Comm&Mgmt,No,54.0,Mkt&Fin,52.21,Not Placed,


### QUESTION: Among students who are placed in a job, how does their MBA specialisation influence their normalized salary?

In [3]:
# filtering to only include the ones that are placed in a job
placed_students = jobdata[jobdata['status'] == 'Placed']

In [4]:
# there are 95 Mkt&Fin and 53 Mkt&HR
placed_students['specialisation'].value_counts()
# converting the column 'specialisation' to a categorical type
placed_students['specialisation'] = placed_students['specialisation'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  placed_students['specialisation'] = placed_students['specialisation'].astype('category')


In [5]:
# one-hot encoding the specialisation column
placed_students = pd.get_dummies(placed_students, columns=['specialisation'])

In [6]:
# normalizing the salary column

scaler = MinMaxScaler()
placed_students['salary_normalized'] = scaler.fit_transform(placed_students[['salary']])

In [8]:
# I first define the Independent variable X, which is the specialisation and Target variable y, which is the salary_normalized
X = placed_students.drop(columns=['salary', 'salary_normalized', 'status'])  # Removing target and unneeded columns
y = placed_students['salary_normalized']  # setting target

In [9]:
# Calculating the prevalence of the target variable
prevalence = placed_students['salary_normalized'].mean()
print(f"Prevalence of salary_normalized: {prevalence}")

Prevalence of salary_normalized: 0.11980460189919646


In [10]:
# I split the data into Train (70%), Tune (15%), Test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_tune, X_test, y_tune, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training Set: {X_train.shape}, {y_train.shape}")
print(f"Tuning Set: {X_tune.shape}, {y_tune.shape}")
print(f"Testing Set: {X_test.shape}, {y_test.shape}")

Training Set: (103, 14), (103,)
Tuning Set: (22, 14), (22,)
Testing Set: (23, 14), (23,)


In [11]:
# grouping the data by specialization and calculating the mean and median salary
# true means they specialised in Mkt&Fin and false means they specialised in Mkt&HR
salary_comparison = placed_students.groupby(['specialisation_Mkt&Fin', 'specialisation_Mkt&HR'])['salary_normalized'].agg(['mean', 'median'])

print(salary_comparison)

                                                  mean    median
specialisation_Mkt&Fin specialisation_Mkt&HR                    
False                  True                   0.095105  0.074324
True                   False                  0.133585  0.094595


### Function Creation

In [12]:
def process_placed_students(jobdata):
    # 1: Filter only placed students
    placed_students = jobdata[jobdata['status'] == 'Placed']
    
    # 2: Convert 'specialisation' to a categorical type
    placed_students['specialisation'] = placed_students['specialisation'].astype('category')
    
    # 3: One-hot encode the 'specialisation' column
    placed_students = pd.get_dummies(placed_students, columns=['specialisation'])
    
    # 4: Normalize the salary column
    scaler = MinMaxScaler()
    placed_students['salary_normalized'] = scaler.fit_transform(placed_students[['salary']])
    
    # 5: Define Independent (X) and Target (y) variables
    X = placed_students.drop(columns=['salary', 'salary_normalized', 'status'])
    y = placed_students['salary_normalized']

    # 6: Calculating the prevalence of the target variable
    prevalence = placed_students['salary_normalized'].mean()
    print(f"Prevalence of salary_normalized: {prevalence}")    
    
    # 7: Split data into Train (70%), Tune (15%), Test (15%)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_tune, X_test, y_tune, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    
    print(f"Training Set: {X_train.shape}, {y_train.shape}")
    print(f"Tuning Set: {X_tune.shape}, {y_tune.shape}")
    print(f"Testing Set: {X_test.shape}, {y_test.shape}")
    
    # 8: Compare salary based on specialization
    salary_comparison = placed_students.groupby(['specialisation_Mkt&Fin', 'specialisation_Mkt&HR'])['salary_normalized'].agg(['mean', 'median'])
    
    return salary_comparison

process_placed_students(jobdata)

Prevalence of salary_normalized: 0.11980460189919646
Training Set: (103, 14), (103,)
Tuning Set: (22, 14), (22,)
Testing Set: (23, 14), (23,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  placed_students['specialisation'] = placed_students['specialisation'].astype('category')


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median
specialisation_Mkt&Fin,specialisation_Mkt&HR,Unnamed: 2_level_1,Unnamed: 3_level_1
False,True,0.095105,0.074324
True,False,0.133585,0.094595


### Step Three: Results Analysis

The dataset effectively addresses the impact of specialization on salary, but key concerns include an imbalance in specialization groups (95 vs 53), potential oversimplification by excluding other salary-influencing factors such as degree type and experience, and missing salary values that could bias results. While one-hot encoding and normalization ensure clean comparisons, incorporating additional predictors and handling missing data could enhance accuracy. Despite these limitations, the dataset provides a solid foundation for analyzing specialization-based salary trends.

## College Completion Dataset

In [60]:
collegedata = pd.read_csv('college.csv')
collegedata.info()
collegedata.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 63 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   index                                 3798 non-null   int64  
 1   unitid                                3798 non-null   int64  
 2   chronname                             3798 non-null   object 
 3   city                                  3798 non-null   object 
 4   state                                 3798 non-null   object 
 5   level                                 3798 non-null   object 
 6   control                               3798 non-null   object 
 7   basic                                 3798 non-null   object 
 8   hbcu                                  94 non-null     object 
 9   flagship                              50 non-null     object 
 10  long_x                                3798 non-null   float64
 11  lat_y            

Unnamed: 0,index,unitid,chronname,city,state,level,control,basic,hbcu,flagship,...,vsa_grad_after6_transfer,vsa_grad_elsewhere_after6_transfer,vsa_enroll_after6_transfer,vsa_enroll_elsewhere_after6_transfer,similar,state_sector_ct,carnegie_ct,counted_pct,nicknames,cohort_size
0,0,100654,Alabama A&M University,Normal,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,...,36.4,5.6,17.2,11.1,232937|100724|405997|113607|139533|144005|2285...,13,386,99.7|07,,882.0
1,1,100663,University of Alabama at Birmingham,Birmingham,Alabama,4-year,Public,Research Universities--very high research acti...,,,...,,,,,196060|180461|201885|145600|209542|236939|1268...,13,106,56.0|07,UAB,1376.0
2,2,100690,Amridge University,Montgomery,Alabama,4-year,Private not-for-profit,Baccalaureate Colleges--Arts & Sciences,,,...,,,,,217925|441511|205124|247825|197647|221856|1353...,16,252,100.0|07,,3.0
3,3,100706,University of Alabama at Huntsville,Huntsville,Alabama,4-year,Public,Research Universities--very high research acti...,,,...,0.0,0.0,0.0,0.0,232186|133881|196103|196413|207388|171128|1900...,13,106,43.1|07,UAH,759.0
4,4,100724,Alabama State University,Montgomery,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,X,,...,,,,,100654|232937|242617|243197|144005|241739|2354...,13,386,88.0|07,ASU,1351.0
5,5,100751,University of Alabama at Tuscaloosa,Tuscaloosa,Alabama,4-year,Public,Research Universities--high research activity,,X,...,61.5,8.5,6.3,5.5,183044|209551|240727|207388|178402|185828|2380...,13,96,75.9|07,,4438.0
6,6,100760,Central Alabama Community College,Alexander City,Alabama,2-year,Public,Associates--Public Rural-serving Medium,,,...,,,,,217712|227225|198491|175935|107460|206923|2407...,25,289,66.7|10,,594.0
7,7,100830,Auburn University at Montgomery,Montgomery,Alabama,4-year,Public,Masters Colleges and Universities--larger prog...,,,...,,,,,229814|210429|160038|176965|169798|207263|1023...,13,386,43.8|07,AUM,536.0
8,8,100858,Auburn University,Auburn University,Alabama,4-year,Public,Research Universities--high research activity,,,...,72.2,9.7,2.0,4.6,223232|204024|163268|165334|132903|171128|1961...,13,96,75.5|07,,4165.0
9,9,100937,Birmingham-Southern College,Birmingham,Alabama,4-year,Private not-for-profit,Baccalaureate Colleges--Arts & Sciences,,,...,,,,,174747|153144|143084|152390|168591|199111|1406...,16,252,88.9|07,BSC,449.0


### QUESTION: Are public universities more affordable than private-for-profit and private-for-non-profit universities based on financial aid and endowments?

In [54]:
# First checking for missing values
print(collegedata[['aid_value', 'endow_value', 'student_count']].isnull().sum())

# Filling missing values with median for aid and endowment per student
collegedata['aid_value'].fillna(collegedata['aid_value'].median(), inplace=True)
collegedata['endow_value'].fillna(collegedata['endow_value'].median(), inplace=True)
collegedata['student_count'].fillna(collegedata['student_count'].median(), inplace=True)


aid_value           1
endow_value      1475
student_count       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collegedata['aid_value'].fillna(collegedata['aid_value'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collegedata['endow_value'].fillna(collegedata['endow_value'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will n

In [55]:
# Converting the column 'control' into a categorical type
collegedata['control'] = collegedata['control'].astype('category')

# One-hot encoding the column 'control' into three binary columns
collegedata = pd.get_dummies(collegedata, columns=['control'])

In [56]:
# Computing the endowment per student
collegedata['endowment_per_student'] = collegedata['endow_value'] / collegedata['student_count']

# Dropping unnecessary columns
collegedata = collegedata.drop(columns=['index', 'unitid', 'chronname', 'city', 'state', 'site', 'long_x', 'lat_y'])

In [57]:
# Normalizing the continuous variables (aid_value and endowment_per_student)

scaler = MinMaxScaler()
collegedata[['aid_value', 'endowment_per_student']] = scaler.fit_transform(collegedata[['aid_value', 'endowment_per_student']])

In [58]:
# Defining the Independent (X) and Target (y) Variables
X = collegedata.drop(columns=['endowment_per_student'])
y = collegedata['endowment_per_student']

# Spliting the dataset into Train (70%), Tune (15%), and Test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_tune, X_test, y_tune, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training Set: {X_train.shape}, {y_train.shape}")
print(f"Tuning Set: {X_tune.shape}, {y_tune.shape}")
print(f"Testing Set: {X_test.shape}, {y_test.shape}")


Training Set: (2658, 57), (2658,)
Tuning Set: (570, 57), (570,)
Testing Set: (570, 57), (570,)


In [59]:
# Computing the summary statistics of endowment per student
print(collegedata.groupby(['control_Private for-profit', 'control_Private not-for-profit', 'control_Public'])['endowment_per_student'].agg(['mean', 'median']))

                                                                              mean  \
control_Private for-profit control_Private not-for-profit control_Public             
False                      False                          True            0.000471   
                           True                           False           0.009693   
True                       False                          False           0.002724   

                                                                            median  
control_Private for-profit control_Private not-for-profit control_Public            
False                      False                          True            0.000076  
                           True                           False           0.002097  
True                       False                          False           0.001948  


### Function Creation

In [61]:
def process_college_data(collegedata):
    # Filling missing values with the median
    collegedata['aid_value'].fillna(collegedata['aid_value'].median(), inplace=True)
    collegedata['endow_value'].fillna(collegedata['endow_value'].median(), inplace=True)
    collegedata['student_count'].fillna(collegedata['student_count'].median(), inplace=True)

    # Converting 'control' into a categorical type
    collegedata['control'] = collegedata['control'].astype('category')

    # One-hot encoding the 'control' column
    collegedata = pd.get_dummies(collegedata, columns=['control'])

    # Computing endowment per student
    collegedata['endowment_per_student'] = collegedata['endow_value'] / collegedata['student_count']

    # Dropping unnecessary columns
    collegedata = collegedata.drop(columns=['index', 'unitid', 'chronname', 'city', 'state', 'site', 'long_x', 'lat_y'])

    # Normalizing the continuous variables (aid_value and endowment_per_student)
    scaler = MinMaxScaler()
    collegedata[['aid_value', 'endowment_per_student']] = scaler.fit_transform(collegedata[['aid_value', 'endowment_per_student']])

    # Defining Independent (X) and Target (y) Variables
    X = collegedata.drop(columns=['endowment_per_student'])
    y = collegedata['endowment_per_student']

    # Splitting the dataset into Train (70%), Tune (15%), and Test (15%)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_tune, X_test, y_tune, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    print(f"Training Set: {X_train.shape}, {y_train.shape}")
    print(f"Tuning Set: {X_tune.shape}, {y_tune.shape}")
    print(f"Testing Set: {X_test.shape}, {y_test.shape}")

    # Computing the summary statistics of endowment per student
    summary_stats = collegedata.groupby(['control_Private for-profit', 'control_Private not-for-profit', 'control_Public'])['endowment_per_student'].agg(['mean', 'median'])
    return summary_stats

process_college_data(collegedata)

Training Set: (2658, 57), (2658,)
Tuning Set: (570, 57), (570,)
Testing Set: (570, 57), (570,)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collegedata['aid_value'].fillna(collegedata['aid_value'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collegedata['endow_value'].fillna(collegedata['endow_value'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will n

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,median
control_Private for-profit,control_Private not-for-profit,control_Public,Unnamed: 3_level_1,Unnamed: 4_level_1
False,False,True,0.000471,7.6e-05
False,True,False,0.009693,0.002097
True,False,False,0.002724,0.001948


### Step Three: Results Analysis
The results suggest that private not-for-profit institutions have the highest endowment per student, followed by public institutions, and private for-profit institutions having the lowest. This aligns with expectations, as private not-for-profits typically have larger financial resources. The analysis effectively displays differences in financial aid and endowment resources which helps address affordability concerns between institution types. However, potential concerns include the fact that this analysis does not consider tuition costs, cost of living, or financial aid distribution, which are important for an accurate affordability assessment. Further refinement incorporating these factors could provide a more comprehensive insight.