# Thesis Results

In [5]:
# Import statements

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy import stats
from scipy.stats import spearmanr
from scipy.stats import chisquare
from scipy.stats import chi2_contingency

from pylab import rcParams
import seaborn as sb
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

## Analyze Likert scale questions using t test to find significant Likert factors

In [6]:
#ttest is used to see how different the two groups are (People who choose to stay vs People who choose to go home) 
ttest_likert = pd.read_excel("likert_stayorleave.xlsx", index_col = "Person")
ttest_likert
replacer = {'Much better at home': 1, 'Slightly better at home': 2,
            'Same': 3,
            'Slightly better in China': 4, 'Much better in China': 5}

# Select String Columns
cols = ttest_likert.columns[ttest_likert.dtypes == 'object']
# Replace Values in those Columns
ttest_likert[cols] = ttest_likert[cols].replace(replacer)
list_of_questions = ttest_likert.drop(columns = ['Do you plan to stay and work in China or go home after your graduation?']).columns.to_list()
ttest_stay = ttest_likert[ttest_likert["Do you plan to stay and work in China or go home after your graduation?"] == "Stay and work in China"]
ttest_leave = ttest_likert[ttest_likert["Do you plan to stay and work in China or go home after your graduation?"] == "Go home"]

In [7]:
# ttest for all questions in list_of_questions
ttest_pval = {}
ttest_val = {}
for question in list_of_questions:
    result = stats.ttest_ind(ttest_stay[question], ttest_leave[question])
    ttest_val[question] = result[0]
    ttest_pval[question] = result[1]

ttest_val = pd.Series(ttest_val)
ttest_pval = pd.Series(ttest_pval)
ttest_result=pd.concat([ttest_val,ttest_pval],axis=1)
ttest_result.columns = ['Values', "P_values"]
ttest_result = ttest_result#.sort_values(by = 'P_values')
ttest_p_value = ttest_result["P_values"].round(3)
ttest_p_value.to_frame()

Unnamed: 0,P_values
Access to healthcare facilities,0.875
Access to recreation facilities,0.532
Air quality,0.478
Educational opportunities for children,0.984
Gender equality,0.451
Job opportunities,0.003
Level of political stability,0.16
Level of public security,0.077
Opportunities to advance your career,0.021
Racial discrimination situation,0.36


In [8]:
#export to excel file
ttest_p_value.to_excel("ttest_p_value.xlsx")
ttest_result.to_excel("ttest_result.xlsx")

In [9]:
#if factors with the p_values < 6.2%, those factors are considered significant
ttest_importance = ttest_result[ttest_result["P_values"]<0.062]
ttest_importance

Unnamed: 0,Values,P_values
Job opportunities,3.018458,0.003432
Opportunities to advance your career,2.349682,0.02132
Starting salary after completing your studies,2.172769,0.032834


## Analyze demographic questions using Chi-square to find important demographic factors

In [10]:
chi_square = pd.read_excel("Not_likert_question_Chisquare.xlsx", index_col = "Person")

# change values of columns to numeric values
for col in chi_square.columns:
    le = LabelEncoder()
    chi_square[col] = le.fit_transform(chi_square[col])
    
#find corelation between x and y
from sklearn.feature_selection import chi2
x = chi_square.drop(columns = ['Do you plan to stay and work in China or go home after your graduation?'], axis =1)
y = chi_square['Do you plan to stay and work in China or go home after your graduation?']
chi_scores = chi2(x, y)
chi_scores_toframe = pd.DataFrame(chi2(x, y), index = ["Chi_value", "P_value"])
chi_scores_toframe.columns = ['What school are you enrolled at in Beihang?',
       'What degree level are you enrolled in?',
       'Which year of the program are you in?', 'What is your gender?',
       'What country do you regard as your home country?',
       'How long have you lived in China?',
       'How do you assess your current level of Chinese language?',
       'What type of visa do you hold to be in China?',
       'What is your primary source of funding for your education in Beihang?',
       'Have you signed a contract stating you must return to work in your home country after your graduation?',
       'Do you have any relatives living in China?',
       'Where do your parents live?',
       'Do you have relatives living in the country where your parents live, who can take care of your parents if they were to need assistance?',
       'Do you have a romantic partner?',
       'Do you have pressure to return home after graduation from your family?',
       'Before you moved to China, did you plan to go home or stay in China after graduation?',
       'Is your plan now different from your initial plan before moving to China?',
       'Overall as a graduate student here, how would you rank your experience?',
       'Do you have Chinese friends?']
chi_scores_toframe= chi_scores_toframe.T
chi_scores_toframe.sort_values(by = 'P_value')

# important factors: p_value<6.2%
chi_scores_toframe[(chi_scores_toframe['P_value']< 0.062)].sort_values(by = 'P_value')['P_value'].to_frame()
# exclude country-related factors in the results because there are not enough representatives from each country to draw a conclusion

Unnamed: 0,P_value
"Before you moved to China, did you plan to go home or stay in China after graduation?",1e-06
What is your primary source of funding for your education in Beihang?,5e-06
Where do your parents live?,0.00013
Is your plan now different from your initial plan before moving to China?,0.005166
What country do you regard as your home country?,0.015219
How long have you lived in China?,0.01925
Have you signed a contract stating you must return to work in your home country after your graduation?,0.038176
Do you have pressure to return home after graduation from your family?,0.0614


## Use Logistic regression to rank the importance of important factors 
### (based on t-test and Chi-square results)

In [11]:
likert = pd.read_excel("likert_stayorleave.xlsx", index_col = "Person")
likert
not_likert = pd.read_excel("Not_likert_question_Chisquare.xlsx", index_col = "Person")
not_likert
important_factors = pd.DataFrame.merge(likert, not_likert, on = ("Person", "Do you plan to stay and work in China or go home after your graduation?"))
important_factors= important_factors[['Do you plan to stay and work in China or go home after your graduation?', 'Job opportunities ', 
    'Opportunities to advance your career ',
    'Starting salary after completing your studies ', 'Before you moved to China, did you plan to go home or stay in China after graduation?',
    'What is your primary source of funding for your education in Beihang?',
    'Is your plan now different from your initial plan before moving to China?',
    'How long have you lived in China?',
    'Have you signed a contract stating you must return to work in your home country after your graduation?',
    'Do you have pressure to return home after graduation from your family?']]
important_factors = important_factors[important_factors['Do you plan to stay and work in China or go home after your graduation?'].isin(['Stay and work in China', 'Go home'])]
important_factors
important_factors.isna().any() 
important_factors['Do you have pressure to return home after graduation from your family?'].fillna(important_factors['Do you have pressure to return home after graduation from your family?'].mode()[0], inplace=True)
# change values of columns to numeric values
for col in important_factors.columns:
    le = LabelEncoder()
    important_factors[col] = le.fit_transform(important_factors[col])
important_factors
important_question = important_factors.drop("Do you plan to stay and work in China or go home after your graduation?", axis =1)

#X represents independent variables
x = important_question
y = important_factors['Do you plan to stay and work in China or go home after your graduation?']
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(x, y)
model.classes_
model.intercept_
list(model.coef_)[0]

array([ 0.0212244 ,  0.02756472, -0.2375122 ,  1.21009257,  0.16353771,
        1.02742879,  0.07491513, -0.604583  , -0.58651405])

In [12]:
#Combine important factors with its coefficients
keys = list(important_question.columns)
values = list(model.coef_)[0]
dictionary = dict(zip(keys, values))
print(dictionary)
s=pd.Series(dictionary)

# sort coefficients based on its absolute values
s.abs().sort_values(ascending = False).to_frame(name = 'Coefficients')

{'Job opportunities ': 0.021224398285053892, 'Opportunities to advance your career ': 0.027564716909741133, 'Starting salary after completing your studies ': -0.2375121955499771, 'Before you moved to China, did you plan to go home or stay in China after graduation?': 1.2100925713406243, 'What is your primary source of funding for your education in Beihang?': 0.16353770870496917, 'Is your plan now different from your initial plan before moving to China?': 1.0274287931030126, 'How long have you lived in China?': 0.07491513249876329, 'Have you signed a contract stating you must return to work in your home country after your graduation?': -0.6045830046402935, 'Do you have pressure to return home after graduation from your family?': -0.586514054574651}


Unnamed: 0,Coefficients
"Before you moved to China, did you plan to go home or stay in China after graduation?",1.210093
Is your plan now different from your initial plan before moving to China?,1.027429
Have you signed a contract stating you must return to work in your home country after your graduation?,0.604583
Do you have pressure to return home after graduation from your family?,0.586514
Starting salary after completing your studies,0.237512
What is your primary source of funding for your education in Beihang?,0.163538
How long have you lived in China?,0.074915
Opportunities to advance your career,0.027565
Job opportunities,0.021224


In [None]:
# => Conclusion: Top 3 Most Important Factors: Initial Plan, Pressure to return home and Job-related Factors