In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import t


## The Student News Service at Clear Mountain State University (CMSU) has decided to gather data about the undergraduate students that attend CMSU. CMSU creates and distributes a survey of 14 questions and receives responses from 62 undergraduates (stored in the Survey.csv file).

In [2]:
# Load the CMSU undergraduate survey responses from the CSV in the working directory.
df2 = pd.read_csv(filepath_or_buffer='Survey.csv')

In [3]:
# Preview the first five survey responses to sanity-check the columns.
df2.head(n=5)

Unnamed: 0,ID,Gender,Age,Class,Major,Grad Intention,GPA,Employment,Salary,Social Networking,Satisfaction,Spending,Computer,Text Messages
0,1,Female,20,Junior,Other,Yes,2.9,Full-Time,50.0,1,3,350,Laptop,200
1,2,Male,23,Senior,Management,Yes,3.6,Part-Time,25.0,1,4,360,Laptop,50
2,3,Male,21,Junior,Other,Yes,2.5,Part-Time,45.0,2,4,600,Laptop,200
3,4,Male,21,Junior,CIS,Yes,2.5,Full-Time,40.0,4,6,600,Laptop,250
4,5,Male,23,Senior,Other,Undecided,2.8,Unemployed,40.0,2,4,500,Laptop,100


In [4]:
# Total number of missing cells across the whole frame (0 means nothing is missing).
df2.isna().sum().sum()

0

## Constructing various contingency tables for this data


## Gender and Major

In [5]:
# Contingency table of counts: Gender on the rows, Major on the columns.
df2_crosstab_gender_major = pd.crosstab(index=df2['Gender'], columns=df2['Major'])
df2_crosstab_gender_major

Major,Accounting,CIS,Economics/Finance,International Business,Management,Other,Retailing/Marketing,Undecided
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,3,3,7,4,4,3,9,0
Male,4,1,4,2,6,4,5,3


## Gender and Grad Intention

In [6]:
# Contingency table of counts: Gender on the rows, Grad Intention on the columns.
df2_crosstab_gender_grad_intention = pd.crosstab(
    index=df2['Gender'],
    columns=df2['Grad Intention'],
)
df2_crosstab_gender_grad_intention

Grad Intention,No,Undecided,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,9,13,11
Male,3,9,17


## Gender and Employment

In [7]:
# Contingency table of counts: Gender on the rows, Employment status on the columns.
df2_crosstab_gender_employement = pd.crosstab(
    index=df2['Gender'],
    columns=df2['Employment'],
)
df2_crosstab_gender_employement

Employment,Full-Time,Part-Time,Unemployed
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,3,24,6
Male,7,19,3


## Gender and Computer

In [8]:
# Contingency table of counts: Gender on the rows, Computer type on the columns.
df2_crosstab_gender_computer = pd.crosstab(
    index=df2['Gender'],
    columns=df2['Computer'],
)
df2_crosstab_gender_computer

Computer,Desktop,Laptop,Tablet
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,2,29,2
Male,3,26,0


### Finding the probability that a randomly selected CMSU student is male, and the probability that the student is female.

In [9]:
# Confirm again that the frame has no missing cells before computing probabilities.
df2.isna().sum().sum()

0

In [10]:
# Head-counts per gender, looked up by label in the value_counts() result.
number_of_male = df2['Gender'].value_counts().loc['Male']
number_of_female = df2['Gender'].value_counts().loc['Female']
# Number of non-null Gender entries, used as the denominator for the probabilities.
total_students = df2['Gender'].count()


In [11]:
# Display the three counts together as a tuple: (male, female, total).
(number_of_male, number_of_female, total_students)

(29, 33, 62)

In [12]:
# Empirical P(Male): male respondents as a fraction of all respondents.
Probability_of_randomly_selected_student_being_male = number_of_male / total_students
Probability_of_randomly_selected_student_being_male

0.46774193548387094

In [13]:
# Empirical P(Female): female respondents as a fraction of all respondents.
Probability_of_randomly_selected_student_being_female = number_of_female / total_students
Probability_of_randomly_selected_student_being_female

0.532258064516129

### The probability of a randomly selected student being Male is 46.77%
### The probability of a randomly selected student being Female is 53.23%



### Conditional probability of different majors among the male students in CMSU.
### Conditional probability of different majors among the female students of CMSU.


In [14]:
# Conditional distribution of Major given Male, in percent: P(Major | Male) * 100.
# The row is selected by its label ('Male') instead of the positional slice [1:2],
# so the result no longer depends on how the Gender rows happen to be ordered.
# The double brackets keep the result a one-row DataFrame, as before.
cp_major_male = (df2_crosstab_gender_major.loc[['Male']] / number_of_male) * 100
cp_major_male

Major,Accounting,CIS,Economics/Finance,International Business,Management,Other,Retailing/Marketing,Undecided
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Male,13.793103,3.448276,13.793103,6.896552,20.689655,13.793103,17.241379,10.344828


In [15]:
# Conditional distribution of Major given Female, in percent: P(Major | Female) * 100.
# The row is selected by its label ('Female') instead of the positional slice [0:1],
# so the result no longer depends on how the Gender rows happen to be ordered.
# The double brackets keep the result a one-row DataFrame, as before.
cp_major_female = (df2_crosstab_gender_major.loc[['Female']] / number_of_female) * 100
# Keep the original (misspelled) name as an alias so any later cell using it still works.
cp_major_felame = cp_major_female
cp_major_female

Major,Accounting,CIS,Economics/Finance,International Business,Management,Other,Retailing/Marketing,Undecided
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,9.090909,9.090909,21.212121,12.121212,12.121212,9.090909,27.272727,0.0



### The above table cp_major_male gives the conditional probability(%) of different majors among the male students in CMSU
### The above table cp_major_female gives the conditional probability(%) of different majors among the female students in CMSU

### Finding the conditional probability of intent to graduate, given that the student is a male.
### Finding the conditional probability of intent to graduate, given that the student is a female.

In [16]:
# Conditional distribution of Grad Intention given Male, in percent.
# Label-based row selection replaces the positional slice [1:2], which silently
# relied on 'Male' being the second row of the crosstab.
cp_grad_int_male = (df2_crosstab_gender_grad_intention.loc[['Male']] / number_of_male) * 100
cp_grad_int_male

Grad Intention,No,Undecided,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,10.344828,31.034483,58.62069


In [17]:
# Conditional distribution of Grad Intention given Female, in percent.
# Label-based row selection replaces the positional slice [0:1], which silently
# relied on 'Female' being the first row of the crosstab.
cp_grad_int_female = (df2_crosstab_gender_grad_intention.loc[['Female']] / number_of_female) * 100
cp_grad_int_female

Grad Intention,No,Undecided,Yes
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,27.272727,39.393939,33.333333


### The above table cp_grad_int_male gives the conditional probability(%) of intent to graduate, given that the student is a male
### The above table cp_grad_int_female gives the conditional probability(%) of intent to graduate, given that the student is a female


### Finding the conditional probability of employment status for the male students as well as for the female students.

In [18]:
# Conditional distribution of Employment status given Male, in percent.
# Label-based row selection replaces the positional slice [1:2], which silently
# relied on 'Male' being the second row of the crosstab.
cp_emp_stat_male = (df2_crosstab_gender_employement.loc[['Male']] / number_of_male) * 100
cp_emp_stat_male

Employment,Full-Time,Part-Time,Unemployed
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,24.137931,65.517241,10.344828


In [19]:
# Conditional distribution of Employment status given Female, in percent.
# Label-based row selection replaces the positional slice [0:1], which silently
# relied on 'Female' being the first row of the crosstab.
cp_emp_stat_female = (df2_crosstab_gender_employement.loc[['Female']] / number_of_female) * 100
cp_emp_stat_female

Employment,Full-Time,Part-Time,Unemployed
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,9.090909,72.727273,18.181818


### The above table cp_emp_stat_male gives the conditional probability(%) of employment status for the male students
### The above table cp_emp_stat_female gives the conditional probability(%) of employment status for the female students

### Finding the conditional probability of laptop preference among the male students as well as among the female students.
 

In [20]:
# P(Laptop | Male) in percent.
# Row and column are both selected by label in a single .loc call; the list around
# 'Male' keeps the result a one-element Series indexed by Gender, as before.
# This replaces the positional slice [1:2], which relied on the crosstab's row order.
cp_laptop_pref_male = (df2_crosstab_gender_computer.loc[['Male'], 'Laptop'] / number_of_male) * 100
cp_laptop_pref_male

Gender
Male    89.655172
Name: Laptop, dtype: float64

In [21]:
# P(Laptop | Female) in percent.
# Row and column are both selected by label in a single .loc call; the list around
# 'Female' keeps the result a one-element Series indexed by Gender, as before.
# This replaces the positional slice [0:1], which relied on the crosstab's row order.
cp_laptop_pref_female = (df2_crosstab_gender_computer.loc[['Female'], 'Laptop'] / number_of_female) * 100
cp_laptop_pref_female

Gender
Female    87.878788
Name: Laptop, dtype: float64

### The conditional probability of laptop preference among the male students is 89.655%
### The conditional probability of laptop preference among the female students is 87.879%

## Based on the above probabilities, checking if the column variable in each case is independent of Gender

In [22]:
# Chi-squared test of independence between Gender and Major.
# The result holds the test statistic, p-value, degrees of freedom and expected counts.
stats.chi2_contingency(observed=df2_crosstab_gender_major)

(7.084844866036089,
 0.42009968345511806,
 7,
 array([[3.72580645, 2.12903226, 5.85483871, 3.19354839, 5.32258065,
         3.72580645, 7.4516129 , 1.59677419],
        [3.27419355, 1.87096774, 5.14516129, 2.80645161, 4.67741935,
         3.27419355, 6.5483871 , 1.40322581]]))

In [23]:
# Chi-squared test of independence between Gender and Grad Intention.
# The result holds the test statistic, p-value, degrees of freedom and expected counts.
stats.chi2_contingency(observed=df2_crosstab_gender_grad_intention)

(4.774796781066374,
 0.09186837889149435,
 2,
 array([[ 6.38709677, 11.70967742, 14.90322581],
        [ 5.61290323, 10.29032258, 13.09677419]]))

In [24]:
# Chi-squared test of independence between Gender and Employment status.
# The result holds the test statistic, p-value, degrees of freedom and expected counts.
stats.chi2_contingency(observed=df2_crosstab_gender_employement)

(2.9355495613715337,
 0.2304376894892966,
 2,
 array([[ 5.32258065, 22.88709677,  4.79032258],
        [ 4.67741935, 20.11290323,  4.20967742]]))

In [25]:
# Chi-squared test of independence between Gender and Computer type.
# The result holds the test statistic, p-value, degrees of freedom and expected counts.
stats.chi2_contingency(observed=df2_crosstab_gender_computer)

(2.114372565783224,
 0.3474320117040881,
 2,
 array([[ 2.66129032, 29.27419355,  1.06451613],
        [ 2.33870968, 25.72580645,  0.93548387]]))

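## We can apply the chi-squared test of independence to the contingency tables we have
### Note: several expected frequencies in the arrays above are below 5, so the chi-squared approximation may be unreliable for these tables and the P values should be read with caution.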

## Hypothesis:
### Null: Column variable is independent of Gender
### Alternate: Column variable is dependent on Gender

## From the above chi-squared tests, we have the following P values:
#### For,
#### Gender and Major : P value = 42.01%
#### Gender and Grad Intention : P value = 9.19%
#### Gender and Employment : P value = 23.04%
#### Gender and Computer pref : P value = 34.74%

## 1. At Alpha = 5%

### All the P values are higher than alpha.
### Hence, we fail to reject the null hypothesis in every case: there is not enough evidence to say that any of the column variables depends on Gender, so we treat them as independent of Gender.

## 2. At Alpha = 1%

### All the P values are higher than alpha.
### Hence, we fail to reject the null hypothesis in every case: there is not enough evidence to say that any of the column variables depends on Gender, so we treat them as independent of Gender.

## 3. At Alpha = 10%

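### Only Grad Intention has a P value (about 9.2%) below alpha, so we reject the null hypothesis for it and conclude that Grad Intention is dependent on Gender.
### For the rest of the column variables we fail to reject the null hypothesis: there is no evidence of dependency on Gender.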


## 2. Note that there are three numerical (continuous) variables in the data set: Salary, Spending and Text Messages. For each of them, comment on whether it follows a normal distribution, and summarize the conclusions.
### [symmetric histogram does not necessarily mean that the underlying distribution is symmetric]

In [26]:
# Normality test on Salary; the null hypothesis is that the sample is normally distributed.
stats.normaltest(df2.loc[:, 'Salary'])

NormaltestResult(statistic=3.84580947969415, pvalue=0.14618172494628334)

In [27]:
# Normality test on Spending; the null hypothesis is that the sample is normally distributed.
stats.normaltest(df2.loc[:, 'Spending'])

NormaltestResult(statistic=30.49562450314631, pvalue=2.387587398454289e-07)

In [28]:
# Normality test on Text Messages; the null hypothesis is that the sample is normally distributed.
stats.normaltest(df2.loc[:, 'Text Messages'])

NormaltestResult(statistic=16.34755294390911, pvalue=0.0002819512224692029)

#### Assuming Alpha = 5%
#### Null hypothesis : The variable follows a normal distribution
#### Alternate hypothesis : The variable does not follow a normal distribution

#### Using the normaltest function from scipy.stats, we can get the P value
#### From the P values, we can say that:

### A.
### For the variable Salary, we fail to reject the null hypothesis as the P value (14.6%) is greater than alpha(5%). Hence, there is no evidence against normality, and the variable Salary can be treated as following a normal distribution.

### B.
### For the variable Spending, we reject the null hypothesis as there is enough evidence to say that the variable Spending doesn't follow a normal distribution. The P value (0.0000239%) is less than alpha(5%). Hence, the variable Spending does not follow a normal distribution.

### C.
### For the variable Text Messages, we reject the null hypothesis as there is enough evidence to say that the variable Text Messages doesn't follow a normal distribution. The P value (0.028%) is less than alpha(5%). Hence, the variable Text Messages does not follow a normal distribution.