In [220]:
import pandas as pd
from QQUnivariate import QQUnivariate


In [221]:
# Although the T-test is widely used, it has certain limitations:
#It assumes that the data are normally distributed, so it may not be appropriate for non-normally distributed data unless sample sizes are large enough (Central Limit Theorem).
#It assumes equal variances between the groups (for independent T-tests). If variances are unequal, Welch’s T-test may be more appropriate.
#The T-test only tests the difference in means and doesn’t capture other important aspects of the data, like variance or distribution shape.


In [225]:
dataset=pd.read_csv("PrePlacement.csv")

In [223]:
dataset

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation,status
0,1.0,67.00,91.00,58.00,55.0,58.80,270000.000000,M,Others,Others,Commerce,Sci&Tech,No,Mkt&HR,Placed
1,2.0,79.33,78.33,77.48,86.5,66.28,200000.000000,M,Central,Others,Science,Sci&Tech,Yes,Mkt&Fin,Placed
2,3.0,65.00,68.00,64.00,75.0,57.80,250000.000000,M,Central,Central,Arts,Comm&Mgmt,No,Mkt&Fin,Placed
3,4.0,56.00,52.00,52.00,66.0,59.43,288655.405405,M,Central,Central,Science,Sci&Tech,No,Mkt&HR,Not Placed
4,5.0,85.80,73.60,73.30,96.8,55.50,425000.000000,M,Central,Central,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211.0,80.60,82.00,77.60,91.0,74.49,400000.000000,M,Others,Others,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
211,212.0,58.00,60.00,72.00,74.0,53.62,275000.000000,M,Others,Others,Science,Sci&Tech,No,Mkt&Fin,Placed
212,213.0,67.00,67.00,73.00,59.0,69.72,295000.000000,M,Others,Others,Commerce,Comm&Mgmt,Yes,Mkt&Fin,Placed
213,214.0,74.00,66.00,58.00,70.0,60.23,204000.000000,F,Others,Others,Commerce,Comm&Mgmt,No,Mkt&HR,Placed


# Unpaired t-Test  Independent  hsc_s =  Commerce/Science

In [224]:
dataset=dataset.dropna()
Commerce_mba_p = dataset[dataset['hsc_s']=='Commerce'] 
Science_mba_p = dataset[dataset['hsc_s']== 'Science']


In [196]:
Commerce_mba_p.loc[:,['hsc_s','mba_p']]

Unnamed: 0,hsc_s,mba_p
0,Commerce,58.80
4,Commerce,55.50
6,Commerce,53.29
8,Commerce,61.29
9,Commerce,52.21
...,...,...
207,Commerce,71.55
209,Commerce,56.49
210,Commerce,74.49
212,Commerce,69.72


In [197]:
Science_mba_p.loc[:,['hsc_s','mba_p']]

Unnamed: 0,hsc_s,mba_p
1,Science,66.28
3,Science,59.43
5,Science,51.58
7,Science,62.14
12,Science,65.04
...,...,...
202,Science,60.11
206,Science,53.39
208,Science,62.92
211,Science,53.62


# Unpaired "ttest_ind"  Independent hsc_s =  Commerce/Science

In [198]:
from scipy.stats import ttest_ind
dataset=dataset.dropna()
Commerce_mba_p = dataset[dataset['hsc_s']=='Commerce']['mba_p']
Science_mba_p = dataset[dataset['hsc_s']== 'Science']['mba_p']
ttest_ind(Others_ssc_b, Central_ssc_b)

Ttest_indResult(statistic=0.655696345275805, pvalue=0.5127271255938062)

# ttest_ind result interpretation

In [104]:
#statistic:
# A larger absolute value of the t-statistic indicates a greater difference between the sample means relative to the variability in the data.
#The sign of the t-statistic indicates the direction of the difference between the sample means. 
#A positive t-statistic suggests that the sample means are greater than the population means,indicating a positive difference,
#A negative t-statistic suggests that the sample means are less than the population meansindicating a negative difference. 
#  This sign is crucial for interpreting the results # of a t-test, as it helps determine whether the observed difference is statistically significant.

# p-value:
# If p-value ≤ α (commonly 0.05), reject the null hypothesis. This suggests a significant difference between the two means.
# If p-value > α, fail to reject the null hypothesis. This suggests no significant difference between the two means.


In [105]:
#statistic=1.9001627265152634 - Low absolute value indicates a positive difference
#pvalue=0.05876239427122642 - Indicates mean is equal to expected value 0.05.

# Hypothesis

In [None]:
#Hypothesis : A premise or claim that we want to test
#Null Hypothesis-  Ho (default  hypothesis or  currently accepted hypothesis or currently accepted value for a parameter)
#Alternative Hypothesis- Ha-  Also called Research hypothesis, Involves a claim to be tested

In [None]:
#Statment: Is the  Commerce and Science (hsc_s)  mba_p is not < 0.05
#NULL hypothesis (Ho) Current Value0.058- No Statistically Significant difference
#NULL Hypothesis not rejected

# Paired t-Test  Dependent hsc_s =  Commerce  - mba_p/etest_p

In [199]:
P_Commerce_mba_p = dataset[dataset['hsc_s']=='Commerce'] 
P_Commerce_etest_p = dataset[dataset['hsc_s']=='Commerce'] 

In [200]:
P_Commerce_mba_p.loc[:,['hsc_s','mba_p']]

Unnamed: 0,hsc_s,mba_p
0,Commerce,58.80
4,Commerce,55.50
6,Commerce,53.29
8,Commerce,61.29
9,Commerce,52.21
...,...,...
207,Commerce,71.55
209,Commerce,56.49
210,Commerce,74.49
212,Commerce,69.72


In [201]:
P_Commerce_etest_p.loc[:,['hsc_s','etest_p']]

Unnamed: 0,hsc_s,etest_p
0,Commerce,55.00
4,Commerce,96.80
6,Commerce,74.28
8,Commerce,91.34
9,Commerce,54.00
...,...,...
207,Commerce,88.56
209,Commerce,67.00
210,Commerce,91.00
212,Commerce,59.00


# Paired  "test_rel" Result - Dependent 

In [202]:
#The ttest_rel function in SciPy is used to perform a paired (dependent) t-test. This test compares the means of two related
#    samples (e.g., before-and-after measurements on the same subjects) to determine if there is a statistically significant 
#    difference between them.
# Result Values
#  statistic: A measure of the difference between the sample means relative to the variability in the data.
#  pvalue: The probability of observing the data (or something more extreme) under the null value
#

In [203]:
from scipy.stats import ttest_rel
dataset=dataset.dropna()
P_Commerce_mba_p = dataset[dataset['hsc_s']=='Commerce']['mba_p']
P_Commerce_etest_p = dataset[dataset['hsc_s']=='Commerce']['etest_p']
ttest_rel(P_Commerce_mba_p,P_Commerce_etest_p )

Ttest_relResult(statistic=-7.868552092606869, pvalue=2.462926468454984e-12)

# ttest_rel result interpretation

In [204]:
#statistic:
# A larger absolute value of the t-statistic indicates a greater difference between the sample means relative to the variability in the data.
#The sign of the t-statistic indicates the direction of the difference between the sample means. 
#A positive t-statistic suggests that the sample means are greater than the population means,indicating a positive difference,
#A negative t-statistic suggests that the sample means are less than the population meansindicating a negative difference. 
#  This sign is crucial for interpreting the results # of a t-test, as it helps determine whether the observed difference is statistically significant.

# p-value:
# If p-value ≤ α (commonly 0.05), reject the null hypothesis. This suggests a significant difference between the two means.
# If p-value > α, fail to reject the null hypothesis. This suggests no significant difference between the two means.



In [205]:
#statistic=-7.868552092606869 - Indicates a greater (negative) difference between the the sample means relative to the variability in the data.
#pvalue=2.462926468454984e-12 - Indicates  no significant difference between the two means.

# ANOVA (Analysis of Variance) - Hypothesis Testing

In [206]:
#ANOVA - statistical test used to examine differences among the means of three or more groups.
# Unlike a t-test, which only compares two groups, ANOVA can handle multiple groups in a single analysis, 
# making it an essential tool for experiments with more than two categories.

# One way ANOVA

In [None]:
# One-way ANOVA is used to compare the means of three or more groups based on a single independent variable. 
# It shows if there is a significant difference among the group means.

# Example: A researcher tests three different fertilizers on plant growth. 
# The independent variable is the fertilizer type, and the dependent variable is the growth rate of plants.

In [125]:
import scipy.stats as stats
stats.f_oneway(dataset['ssc_p'], dataset['hsc_p'], dataset['degree_p'])

F_onewayResult(statistic=0.6719700864663097, pvalue=0.5110602818995302)

In [None]:
# Hypothesis Result
#In this One way ANOVA pvalue (0.511)is greater than 0.05 reject Ho  Null Hypothesis

# Two way ANOVA

In [None]:
#Two-way ANOVA is used when there are two independent variables,
# allowing researchers to explore individualand interactive effects.

#Example: A researcher examines how different teaching methods (lecture vs. discussion) and class times (morning vs. 
# afternoon) impact student performance. 
# To determined How  independent variables like teaching methods and class times affect the dependent variables


In [153]:
!pip install statsmodels



In [207]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [227]:
dataset=pd.read_csv("PrePlacement.csv")

In [228]:
dataset=dataset.dropna()

In [235]:
dataset

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation,status
0,1.0,67.00,91.00,58.00,55.0,58.80,270000.000000,M,Others,Others,Commerce,Sci&Tech,No,Mkt&HR,Placed
1,2.0,79.33,78.33,77.48,86.5,66.28,200000.000000,M,Central,Others,Science,Sci&Tech,Yes,Mkt&Fin,Placed
2,3.0,65.00,68.00,64.00,75.0,57.80,250000.000000,M,Central,Central,Arts,Comm&Mgmt,No,Mkt&Fin,Placed
3,4.0,56.00,52.00,52.00,66.0,59.43,288655.405405,M,Central,Central,Science,Sci&Tech,No,Mkt&HR,Not Placed
4,5.0,85.80,73.60,73.30,96.8,55.50,425000.000000,M,Central,Central,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211.0,80.60,82.00,77.60,91.0,74.49,400000.000000,M,Others,Others,Commerce,Comm&Mgmt,No,Mkt&Fin,Placed
211,212.0,58.00,60.00,72.00,74.0,53.62,275000.000000,M,Others,Others,Science,Sci&Tech,No,Mkt&Fin,Placed
212,213.0,67.00,67.00,73.00,59.0,69.72,295000.000000,M,Others,Others,Commerce,Comm&Mgmt,Yes,Mkt&Fin,Placed
213,214.0,74.00,66.00,58.00,70.0,60.23,204000.000000,F,Others,Others,Commerce,Comm&Mgmt,No,Mkt&HR,Placed


In [255]:
model = ols('salary ~ C(degree_p) + C(ssc_p) + C(degree_p):C(ssc_p)', data=dataset).fit()


In [256]:
annova_result = anova_lm(model,type=2)

ValueError: shapes (4,9167) and (215,) not aligned: 9167 (dim 1) != 215 (dim 0)

In [254]:
annova_result

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(degree_p),88.0,803919000000.0,9135443000.0,2.348429,0.000214
C(ssc_p),102.0,426461200000.0,4180993000.0,1.074799,0.381811
Residual,64.0,248961500000.0,3890024000.0,,
