In [None]:
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import linear_model
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [None]:
def _numbered(prefix, count):
    """Return the column names prefix1 .. prefix<count> (e.g. AssessJob1 .. AssessJob10)."""
    return [prefix + str(i) for i in range(1, count + 1)]


# Load the survey answers and the accompanying schema file.
data = pd.read_csv('../input/survey_results_public.csv')
dataSchema = pd.read_csv('../input/survey_results_schema.csv')

# Columns removed before any encoding happens. Numbered question families are
# generated rather than typed out, and every label is listed exactly once.
DROPPED_COLUMNS = (
    ['Salary', 'Currency', 'SalaryType', 'Respondent']
    + _numbered('AssessJob', 10)
    + _numbered('AssessBenefits', 11)
    + _numbered('JobContactPriorities', 5)
    + _numbered('JobEmailPriorities', 7)
    + ['UpdateCV', 'CurrencySymbol', 'CommunicationTools',
       'EducationTypes', 'SelfTaughtTypes', 'TimeAfterBootcamp', 'HackathonReasons']
    + _numbered('AgreeDisagree', 3)
    + ['DatabaseDesireNextYear', 'PlatformWorkedWith', 'PlatformDesireNextYear',
       'FrameworkDesireNextYear', 'IDE', 'OperatingSystem',
       'Methodology', 'VersionControl', 'CheckInCode', 'AdBlocker', 'AdBlockerDisable',
       'AdBlockerReasons']
    + _numbered('AdsAgreeDisagree', 3)
    + ['AdsActions']
    + _numbered('AdsPriorities', 7)
    + ['AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture', 'EthicsChoice',
       'LanguageDesireNextYear', 'EthicsReport', 'EthicsResponsible',
       'StackOverflowRecommend', 'StackOverflowVisit',
       'StackOverflowHasAccount', 'StackOverflowParticipate', 'StackOverflowJobs',
       'StackOverflowDevStory', 'StackOverflowJobsRecommend', 'StackOverflowConsiderMember']
    + _numbered('HypotheticalTools', 5)
    + ['ErgonomicDevices', 'SexualOrientation',
       'Dependents', 'MilitaryUS', 'SurveyTooLong', 'SurveyEasy',
       'EthicalImplications', 'JobSearchStatus']
)

data.drop(inplace=True, columns=DROPPED_COLUMNS)
data.head()

In [None]:
# Approximate years of schooling for each 'EducationParents' answer.
# Primary school is coded 6 (not 12) so it ranks below secondary school.
formalEdDiction = {
                    'They never completed any formal education':0,
                    'Primary/elementary school':6,
                    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)':12,
                    'Associate degree':14,
                    'Some college/university study without earning a degree':14,
                    'Bachelor’s degree (BA, BS, B.Eng., etc.)':16,
                    'Master’s degree (MA, MS, M.Eng., MBA, etc.)':18,
                    'Other doctoral degree (Ph.D, Ed.D., etc.)':20,
                    'Professional degree (JD, MD, etc.)':20,
                    # Unused while the dropna below runs first; kept as a fallback.
                    np.nan: 0
                  }

# Rows without an answer are discarded, then the text answer is replaced by its code.
# Answers that are not keys of the dict become NaN.
data.dropna(subset=['EducationParents'],inplace=True)
data['EducationParents']=data['EducationParents'].map(formalEdDiction)

In [None]:
# Binary encoding of 'UndergradMajor': 1 for the majors in techMajors,
# 0 for every other listed answer and for missing answers (np.nan key).
techMajors = [
    "Another engineering discipline (ex. civil, electrical, mechanical)",
    "Computer science, computer engineering, or software engineering",
    "Information systems, information technology, or system administration",
    "Mathematics or statistics",
    "Web development or web design",
]
otherMajors = [
    "A business discipline (ex. accounting, finance, marketing)",
    "A health science (ex. nursing, pharmacy, radiology)",
    "A humanities discipline (ex. literature, history, philosophy)",
    "A natural science (ex. biology, chemistry, physics)",
    "A social science (ex. anthropology, psychology, political science)",
    "Fine arts or performing arts (ex. graphic design, music, studio art)",
    "I never declared a major",
    np.nan,
]

undergradMajors = dict.fromkeys(otherMajors, 0)
undergradMajors.update(dict.fromkeys(techMajors, 1))

data['UndergradMajor'] = data['UndergradMajor'].map(undergradMajors)

In [None]:
# Years-of-schooling code for the respondent's own 'FormalEducation' answer.
# This rebinds formalEdDiction, which an earlier cell used for EducationParents.
formalEdDiction = dict([
    ("I never completed any formal education", 0),
    ("Primary/elementary school", 6),
    ("Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)", 12),
    ("Associate degree", 14),
    ("Some college/university study without earning a degree", 14),
    ("Bachelor’s degree (BA, BS, B.Eng., etc.)", 16),
    ("Master’s degree (MA, MS, M.Eng., MBA, etc.)", 18),
    ("Other doctoral degree (Ph.D, Ed.D., etc.)", 20),
    ("Professional degree (JD, MD, etc.)", 20),
])

# Drop unanswered rows first, then swap the text for its numeric code.
data.dropna(subset=["FormalEducation"], inplace=True)
data["FormalEducation"] = data["FormalEducation"].map(formalEdDiction)

In [None]:
# Each 'CompanySize' bracket is coded by its upper bound; the open-ended top
# bracket uses 15000. '1,000 to 4,999 employees' is 4999 (not 5000) so it
# follows the same upper-bound convention as every other bounded bracket.
compSizeDiction = {
                        'Fewer than 10 employees': 9,
                        '10 to 19 employees':19,
                        '20 to 99 employees':99,
                        '100 to 499 employees':499,
                        '500 to 999 employees':999,
                        '1,000 to 4,999 employees':4999,
                        '5,000 to 9,999 employees':9999,
                        '10,000 or more employees':15000
                  }

# Rows without a company size are discarded before encoding.
data.dropna(subset=['CompanySize'],inplace=True)
data['CompanySize']=data['CompanySize'].map(compSizeDiction)

In [None]:
# Each three-year bracket ('0-2 years' ... '27-29 years') is coded by its upper
# bound; the open-ended bracket is coded 35.
yearCodingDiction = {
    '%d-%d years' % (upper - 2, upper): upper
    for upper in range(2, 30, 3)
}
yearCodingDiction['30 or more years'] = 35

# The same lookup encodes both experience columns; unanswered rows are dropped
# column by column, in the same order as before.
for yearsColumn in ('YearsCoding', 'YearsCodingProf'):
    data.dropna(subset=[yearsColumn], inplace=True)
    data[yearsColumn] = data[yearsColumn].map(yearCodingDiction)

In [None]:
# Daily hours at a computer, coded by the upper end of each bracket
# ('Over 12 hours' is coded 14).
hoursCompDiction = dict([
    ("Less than 1 hour", 1),
    ("1 - 4 hours", 4),
    ("5 - 8 hours", 8),
    ("9 - 12 hours", 12),
    ("Over 12 hours", 14),
])

# Unanswered rows are removed before the text is replaced by its code.
data.dropna(subset=['HoursComputer'], inplace=True)
data['HoursComputer'] = data['HoursComputer'].map(hoursCompDiction)

In [None]:
# Daily hours spent outside, in hours (float) per answer bracket.
hoursOutDiction = dict([
    ("Less than 30 minutes", 0.5),
    ("30 - 59 minutes", 1.0),
    ("1 - 2 hours", 2.0),
    ("3 - 4 hours", 4.0),
    ("Over 4 hours", 5.0),
])

# Unanswered rows are removed before encoding.
data.dropna(subset=['HoursOutside'], inplace=True)
data['HoursOutside'] = data['HoursOutside'].map(hoursOutDiction)

In [None]:
# Skipped meals per week; a missing answer is treated like 'Never' (0).
skipMealDiction = dict([
    ("Never", 0),
    (np.nan, 0),
    ("1 - 2 times per week", 2),
    ("3 - 4 times per week", 4),
    ("Daily or almost every day", 7),
])

data['SkipMeals'] = data['SkipMeals'].map(skipMealDiction)

In [None]:
# Exercise sessions per week; a missing answer is coded 0, the same as
# "I don't typically exercise".
exerciseDiction = dict([
    ("I don't typically exercise", 0),
    (np.nan, 0),
    ("1 - 2 times per week", 2),
    ("3 - 4 times per week", 4),
    ("Daily or almost every day", 7),
])

data["Exercise"] = data["Exercise"].map(exerciseDiction)

In [None]:
# Collapse the seven-point satisfaction scale to a binary label:
# 1 for any "satisfied" answer, 0 for neutral or any "dissatisfied" answer.
notSatisfied = [
    "Extremely dissatisfied",
    "Moderately dissatisfied",
    "Slightly dissatisfied",
    "Neither satisfied nor dissatisfied",
]
satisfied = [
    "Slightly satisfied",
    "Moderately satisfied",
    "Extremely satisfied",
]
satisfactionDict = dict.fromkeys(notSatisfied, 0)
satisfactionDict.update(dict.fromkeys(satisfied, 1))
satisfactionDict[np.nan] = np.nan

# Rows missing either satisfaction answer are dropped, then both columns are encoded.
satisfactionColumns = ['JobSatisfaction', 'CareerSatisfaction']
data.dropna(subset=satisfactionColumns, inplace=True)
for satisfactionColumn in satisfactionColumns:
    data[satisfactionColumn] = data[satisfactionColumn].map(satisfactionDict)

In [None]:
# Age bracket coded by its upper bound ('65 years or older' is coded 75);
# a missing answer is imputed as 45.
ageDiction = dict([
    ("Under 18 years old", 18),
    ("18 - 24 years old", 24),
    ("25 - 34 years old", 34),
    ("35 - 44 years old", 44),
    ("45 - 54 years old", 54),
    ("55 - 64 years old", 64),
    ("65 years or older", 75),
    (np.nan, 45),
])

data["Age"] = data["Age"].map(ageDiction)

In [None]:
# Time until fully productive, in days; a missing answer is imputed as 180.
productiveDiction = dict([
    ("Less than a month", 30),
    ("One to three months", 90),
    ("Three to six months", 180),
    ("Six to nine months", 270),
    ("Nine months to a year", 365),
    ("More than a year", 545),
    (np.nan, 180),
])

data["TimeFullyProductive"] = data["TimeFullyProductive"].map(productiveDiction)

In [None]:
# Wake-up time coded as an hour of the day. 'No set schedule' is coded 9,
# night shifts 21, and a missing answer 7.
wakeDiction = dict([
    ("Before 5:00 AM", 4),
    ("Between 5:00 - 6:00 AM", 5),
    ("Between 6:01 - 7:00 AM", 6),
    ("Between 7:01 - 8:00 AM", 7),
    ("Between 8:01 - 9:00 AM", 8),
    ("Between 9:01 - 10:00 AM", 9),
    ("Between 10:01 - 11:00 AM", 10),
    ("Between 11:01 AM - 12:00 PM", 11),
    ("After 12:01 PM", 12),
    ("I do not have a set schedule", 9),
    ("I work night shifts", 21),
    (np.nan, 7),
])

data["WakeTime"] = data["WakeTime"].map(wakeDiction)

In [None]:
# Months since the last job change (upper end of each bracket; 'More than
# 4 years ago' is coded 72). "I've never had a job" and missing answers map to
# NaN and are therefore removed by the dropna that follows the mapping.
lastJobDic = dict([
    ("Less than a year ago", 12),
    ("Between 1 and 2 years ago", 24),
    ("Between 2 and 4 years ago", 48),
    ("More than 4 years ago", 72),
    ("I've never had a job", np.nan),
    (np.nan, np.nan),
])

data["LastNewJob"] = data["LastNewJob"].map(lastJobDic)
data.dropna(subset=["LastNewJob"], inplace=True)

In [None]:
# Numeric code for each 'HopeFiveYears' answer (0 = retirement ... 10 = founder).
# The np.nan entry is never hit because unanswered rows are dropped first.
hopeFiveYearsDic = dict([
    ("Retirement", 0),
    ("Doing the same work", 1),
    ("Working in a career completely unrelated to software development", 4),
    (np.nan, 5),
    ("Working in a different or more specialized technical role than the one I'm in now", 6),
    ("Working as an engineering manager or other functional manager", 7),
    ("Working as a product manager or project manager", 8),
    ("Working as a founder or co-founder of my own company", 10),
])

data.dropna(subset=["HopeFiveYears"], inplace=True)
data["HopeFiveYears"] = data["HopeFiveYears"].map(hopeFiveYearsDic)

# Yes/No columns become 1/0; any other value (including missing) becomes NaN.
yesNo = {"Yes": 1, "No": 0}
data["OpenSource"] = data["OpenSource"].map(yesNo)
data["Hobby"] = data["Hobby"].map(yesNo)

In [None]:
# Monitor count as an integer; 'More than 4' is coded 6 and a missing answer 1.
monitorCounts = {"1": 1, "2": 2, "3": 3, "4": 4, "More than 4": 6, np.nan: 1}
data["NumberMonitors"] = data["NumberMonitors"].map(monitorCounts)

In [None]:
# Student status: 0 = no, 1 = part-time, 2 = full-time. Missing answers stay
# NaN through the mapping and are removed by the dropna afterwards.
studDict = dict([
    ("No", 0),
    ("Yes, part-time", 1),
    ("Yes, full-time", 2),
    (np.nan, np.nan),
])

data["Student"] = data["Student"].map(studDict)
data.dropna(subset=["Student"], inplace=True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated
# 'DatabaseWorkedWith' answers.
# Missing answers are skipped with pd.isnull: NaN never compares equal to
# itself, so an `== np.nan` test can never be True.
databaseSet = set()
for answer in data['DatabaseWorkedWith'].unique():
    if pd.isnull(answer):
        continue
    databaseSet.update(str(answer).split(';'))
# A literal 'nan' token is never treated as a database label.
databaseSet.discard('nan')

def databaseTypeBool(stringDatabase,databaseType):
    """Return 1 if databaseType is one of the ';'-separated entries of
    stringDatabase, otherwise 0. A null stringDatabase yields 0."""
    if pd.isnull(stringDatabase):
        return 0
    # Exact match against whole entries, not a substring search.
    return int(databaseType in stringDatabase.split(';'))

def databaseTypeTransform(data,databaseList):
    """Add one 0/1 indicator column per label in databaseList to data (in place).

    Each new column is named after its label and is 1 where the row's
    'DatabaseWorkedWith' answer contains that label, as decided by
    databaseTypeBool.
    """
    for database in databaseList:
        data[database] = data['DatabaseWorkedWith'].apply(databaseTypeBool,databaseType = database)

# Labels are sorted so the new columns are appended in the same order on every
# run; iterating a set of strings directly can change order between kernel
# sessions.
databaseTypeTransform(data,sorted(databaseSet))
data.drop(columns = ['DatabaseWorkedWith'], inplace = True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated
# 'FrameworkWorkedWith' answers.
# pd.isnull is used to skip missing answers because `value == np.nan` is
# always False (NaN is not equal to anything, itself included).
frameworkSet = set()
for answer in data['FrameworkWorkedWith'].unique():
    if pd.isnull(answer):
        continue
    frameworkSet.update(str(answer).split(';'))
# A literal 'nan' token is never treated as a framework label.
frameworkSet.discard('nan')

def frameworkTypeBool(stringFrameWork,frameworkType):
    """Tell whether frameworkType is listed in a ';'-separated answer.

    Returns 1 when frameworkType equals one of the entries of stringFrameWork,
    and 0 otherwise or when stringFrameWork is null.
    """
    if pd.isnull(stringFrameWork):
        return 0
    listed = stringFrameWork.split(';')
    return 1 if frameworkType in listed else 0

def frameworkTypeTransform(data,frameworkList):
    """Add one 0/1 indicator column per label in frameworkList to data (in place).

    Each new column is named after its label and is 1 where the row's
    'FrameworkWorkedWith' answer contains that label, as decided by
    frameworkTypeBool.
    """
    for framework in frameworkList:
        data[framework] = data['FrameworkWorkedWith'].apply(frameworkTypeBool,frameworkType = framework)

# Sorted labels give a reproducible column order; plain set iteration order for
# strings can differ from one kernel session to the next.
frameworkTypeTransform(data,sorted(frameworkSet))
data.drop(columns = ['FrameworkWorkedWith'], inplace = True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated
# 'LanguageWorkedWith' answers. The set keeps its original seed 'C', so a 'C'
# indicator column is always produced.
# Missing answers are skipped with pd.isnull (`== np.nan` is always False), and
# discard() is used instead of remove() so a column without any missing answer
# no longer raises KeyError.
langSet = {'C'}
for answer in data['LanguageWorkedWith'].unique():
    if pd.isnull(answer):
        continue
    langSet.update(str(answer).split(';'))
langSet.discard('nan')

def langTypeBool(stringLang,langType):
    """Return 1 when langType is one of the ';'-separated languages in
    stringLang, else 0. Null input counts as "not listed" (0)."""
    # The comparison is per entry, so 'C' does not match inside 'C++' or 'C#'.
    return 0 if pd.isnull(stringLang) else int(langType in stringLang.split(';'))

def langTypeTransform(data,langList):
    """Add one 0/1 indicator column per label in langList to data (in place).

    Each new column is named after its label and is 1 where the row's
    'LanguageWorkedWith' answer contains that label, as decided by langTypeBool.
    """
    for lang in langList:
        data[lang] = data['LanguageWorkedWith'].apply(langTypeBool,langType = lang)

# Sorted labels give a reproducible column order; plain set iteration order for
# strings can differ from one kernel session to the next.
langTypeTransform(data,sorted(langSet))
data.drop(columns = ['LanguageWorkedWith'], inplace = True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated 'RaceEthnicity'
# answers. The original seed value is kept, so its indicator column is always
# produced.
# Missing answers are skipped with pd.isnull (`== np.nan` is always False), and
# discard() replaces remove() so a column without missing answers no longer
# raises KeyError.
ethSet = {'White or of European descent'}
for answer in data['RaceEthnicity'].unique():
    if pd.isnull(answer):
        continue
    ethSet.update(str(answer).split(';'))
ethSet.discard('nan')

def ethTypeBool(stringEth,ethType):
    """Return 1 if ethType equals one of the ';'-separated entries of
    stringEth, and 0 if it does not or if stringEth is null."""
    if pd.isnull(stringEth):
        return 0
    matches = any(entry == ethType for entry in stringEth.split(';'))
    return int(matches)

def ethTypeTransform(data,ethList):
    """Add one 0/1 indicator column per label in ethList to data (in place).

    Each new column is named after its label and is 1 where the row's
    'RaceEthnicity' answer contains that label, as decided by ethTypeBool.
    """
    for ethType in ethList:
        data[ethType] = data['RaceEthnicity'].apply(ethTypeBool,ethType=ethType)

# Sorted labels give a reproducible column order; plain set iteration order for
# strings can differ from one kernel session to the next.
ethTypeTransform(data,sorted(ethSet))
data.drop(columns = ['RaceEthnicity'], inplace = True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated 'Gender'
# answers. The original seed 'Male' is kept, so a 'Male' indicator column is
# always produced.
# Missing answers are skipped with pd.isnull (`== np.nan` is always False), and
# discard() replaces remove() so a column without missing answers no longer
# raises KeyError.
genSet = {'Male'}
for answer in data['Gender'].unique():
    if pd.isnull(answer):
        continue
    genSet.update(str(answer).split(';'))
genSet.discard('nan')

def genTypeBool(stringGen,genType):
    """Return 1 when genType appears as a whole entry in the ';'-separated
    stringGen, 0 when it does not, and 0 for a null stringGen."""
    if pd.isnull(stringGen):
        return 0
    # list.count compares whole entries, so 'Male' does not match 'Female'.
    return 1 if stringGen.split(';').count(genType) > 0 else 0

def genTypeTransform(data,genList):
    """Add one 0/1 indicator column per label in genList to data (in place).

    Each new column is named after its label and is 1 where the row's 'Gender'
    answer contains that label, as decided by genTypeBool.
    """
    for genType in genList:
        data[genType] = data['Gender'].apply(genTypeBool,genType=genType)

# Sorted labels give a reproducible column order; plain set iteration order for
# strings can differ from one kernel session to the next.
genTypeTransform(data,sorted(genSet))
data.drop(columns = ['Gender'], inplace = True)

In [None]:
# Collect every distinct label that occurs in the ';'-separated 'DevType'
# answers. The original seed value is kept, so its indicator column is always
# produced.
# Missing answers are skipped with pd.isnull (`== np.nan` is always False), and
# discard() replaces remove() so a column without missing answers no longer
# raises KeyError.
devSet = {'Back-end developer'}
for answer in data['DevType'].unique():
    if pd.isnull(answer):
        continue
    devSet.update(str(answer).split(';'))
devSet.discard('nan')

def devTypeBool(stringDev,devType):
    """Return 1 if devType is one of the ';'-separated roles in stringDev,
    otherwise 0. A null stringDev yields 0."""
    if pd.isnull(stringDev):
        return 0
    roles = set(stringDev.split(';'))
    return int(devType in roles)

def devTypeTransform(data,devList):
    """Add one 0/1 indicator column per label in devList to data (in place).

    Each new column is named after its label and is 1 where the row's 'DevType'
    answer contains that label, as decided by devTypeBool. A label that equals
    an existing column name overwrites that column.
    """
    for devType in devList:
        data[devType] = data['DevType'].apply(devTypeBool,devType=devType)

# Sorted labels give a reproducible column order; plain set iteration order for
# strings can differ from one kernel session to the next.
# NOTE(review): if 'Student' is one of the DevType labels, this call replaces
# the encoded data['Student'] column with the DevType flag — verify against
# the data and rename one of the two if so.
devTypeTransform(data,sorted(devSet))
data.drop(columns = ['DevType'], inplace = True)

In [None]:
# Only rows with a salary are usable; show the salary summary in plain
# fixed-point notation instead of scientific notation.
data.dropna(subset=["ConvertedSalary"], inplace=True)
data["ConvertedSalary"].describe().map("{:f}".format)

In [None]:
# Keep rows whose salary lies within half a standard deviation of the mean
# (mean and std are taken over all rows of `data`), restricted to full-time
# employees in the United States earning more than 17000.
salaryCol = data['ConvertedSalary']
keepRows = (
    (np.abs(salaryCol - salaryCol.mean()) <= (0.5 * salaryCol.std()))
    & (data['Country'] == 'United States')
    & (data['Employment'] == 'Employed full-time')
    & (salaryCol > 17000)
)
# .copy() makes dataNoOutlier an independent frame, so adding a column below
# does not write into a slice of `data` (SettingWithCopyWarning).
dataNoOutlier = data.loc[keepRows].copy()

# SalaryRange is the index of the salary bin: 0 for [19999, 50000),
# 1 for [50000, 75000), ... 4 for [150000, 20000000).
# NOTE(review): salaries between 17000 and 19999 pass the filter above but lie
# below the first bin edge, so they receive SalaryRange == -1 — confirm whether
# the filter threshold or the first edge should change.
myBins = np.array([19999, 50000, 75000, 100000, 150000, 20000000])
dataNoOutlier['SalaryRange'] = np.digitize(np.array(dataNoOutlier['ConvertedSalary']),myBins,right=False) - 1

# The two filter columns are constant after filtering and are dropped.
dataNoOutlier = dataNoOutlier.drop(columns = ['Employment', 'Country'])

In [None]:
# Salary summary of the filtered frame, printed in fixed-point notation.
dataNoOutlier["ConvertedSalary"].describe().map("{:f}".format)

In [None]:
# Salary distribution for each JobSatisfaction value.
sns.boxplot(data=dataNoOutlier, x='JobSatisfaction', y='ConvertedSalary')

In [None]:
# Salary per Student value, split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='Student', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Salary per FormalEducation value, split by JobSatisfaction.
# NOTE(review): this plot reads the unfiltered `data` frame, unlike the
# neighbouring plots, which use dataNoOutlier — confirm that is intended.
sns.boxplot(data=data, x='FormalEducation', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Salary per HopeFiveYears code, split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='HopeFiveYears', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Histogram of the filtered salaries (raw counts, no KDE curve) with a rug of
# the individual observations.
# NOTE(review): seaborn has deprecated distplot; histplot plus rugplot is the
# documented replacement once the installed seaborn version provides them.
sns.distplot(dataNoOutlier['ConvertedSalary'], hist=True,norm_hist=False,rug=True,kde=False)

In [None]:
# Salary per LastNewJob code (months), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='LastNewJob', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per LastNewJob code, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='LastNewJob', hue='JobSatisfaction')

In [None]:
# Salary for open-source contributors (1) and non-contributors (0), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='OpenSource', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per OpenSource value, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='OpenSource', hue='JobSatisfaction')

In [None]:
# Salary per Hobby value (1 = codes as a hobby), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='Hobby', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per Hobby value, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='Hobby', hue='JobSatisfaction')

In [None]:
# Salary per WakeTime code (hour of day), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='WakeTime', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per WakeTime code, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='WakeTime', hue='JobSatisfaction')

In [None]:
# Salary per Exercise code (sessions per week), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='Exercise', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per Exercise code, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='Exercise', hue='JobSatisfaction')

In [None]:
# Salary per FormalEducation code (filtered frame), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='FormalEducation', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per FormalEducation code.
sns.countplot(data=dataNoOutlier, x='FormalEducation')

In [None]:
# Scatter of salary against FormalEducation code with a fitted regression line.
sns.regplot(data=dataNoOutlier, x='FormalEducation', y='ConvertedSalary')

In [None]:
# Salary per HoursComputer code, split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='HoursComputer', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Salary per CompanySize code, split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='CompanySize', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Scatter of salary against CompanySize code with a fitted regression line.
sns.regplot(data=dataNoOutlier, x='CompanySize', y='ConvertedSalary')

In [None]:
# Salary per TimeFullyProductive code (days), split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='TimeFullyProductive', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Number of respondents per TimeFullyProductive code, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='TimeFullyProductive', hue='JobSatisfaction')

In [None]:
# Salary per YearsCodingProf code, split by JobSatisfaction.
sns.boxplot(data=dataNoOutlier, x='YearsCodingProf', y='ConvertedSalary', hue='JobSatisfaction')

In [None]:
# Scatter of salary against YearsCodingProf code with a fitted regression line.
sns.regplot(data=dataNoOutlier, x='YearsCodingProf', y='ConvertedSalary')

In [None]:
# Is the undergraduate major CS-related (UndergradMajor == 1)?
# Respondent counts per flag value, split by JobSatisfaction.
sns.countplot(data=dataNoOutlier, x='UndergradMajor', hue='JobSatisfaction')

In [None]:
# Summary statistics of the numeric columns of the filtered frame.
dataNoOutlier.describe()

In [None]:
print(dataNoOutlier.shape)

# Features: everything except the two classification labels. ConvertedSalary
# is still among the features at this point.
featureFrame = dataNoOutlier.drop(columns=['JobSatisfaction', 'SalaryRange'])
# Targets: the regression target and both classification labels, kept together
# so one split serves every model below.
targetFrame = dataNoOutlier[['ConvertedSalary', 'JobSatisfaction', 'SalaryRange']]

# 75 % / 25 % split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    featureFrame,
    targetFrame,
    test_size=0.25,
    random_state=1,
)
print([x_train.shape, x_test.shape])
x_train.head()

In [None]:
# Feature set for the linear model: the DevType indicator columns,
# CompanySize, NumberMonitors and the regression target itself are removed.
linExtraDropped = ['CompanySize', 'NumberMonitors', 'ConvertedSalary']
x_train_lin = x_train.drop(columns=devSet).drop(columns=linExtraDropped)
x_test_lin = x_test.drop(columns=devSet).drop(columns=linExtraDropped)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler()
x_train_lin_scaled = scaler.fit_transform(x_train_lin)
x_test_lin_scaled = scaler.transform(x_test_lin)

In [None]:
# Lasso regression (alpha = 5) of ConvertedSalary on the scaled linear features.
X, y = x_train_lin_scaled, y_train['ConvertedSalary']
X2, y2 = x_test_lin_scaled, y_test['ConvertedSalary']

model = linear_model.Lasso(alpha=5)
predicted = model.fit(X=X, y=y)  # fit() returns the fitted estimator itself
print(predicted.coef_)
print(predicted.intercept_)

# R squared and adjusted R squared, first on the training set, then on the test set.
for _features, _target in ((X, y), (X2, y2)):
    _r2 = model.score(_features, _target)
    _n, _p = len(_target), _features.shape[1]
    print(_r2, 1 - (1 - _r2) * (_n - 1) / (_n - _p - 1))

In [None]:
# Standardise the full feature set; the scaler is fitted on the training rows
# only and then applied to the test rows.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# 6-nearest-neighbours classifier for SalaryRange.
# Author's remark: terrible even though it knows the actual salary.
classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(x_train_scaled, y_train['SalaryRange'])
y_pred = classifier.predict(x_test_scaled)

# Confusion matrix on the test set: rows are actual classes, columns predicted.
conf_matrix = metrics.confusion_matrix(y_test['SalaryRange'], y_pred)
sns.heatmap(conf_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Test-set scores for the SalaryRange predictions; precision, recall and F1
# are reported per class (average=None).
_actual = y_test['SalaryRange']
accuracy = metrics.accuracy_score(_actual, y_pred)
error = 1 - metrics.accuracy_score(_actual, y_pred)
precision, recall, F1_score = [
    _scorer(_actual, y_pred, average=None)
    for _scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
]
print([accuracy, error, precision, recall, F1_score])

In [None]:
# Gaussian naive Bayes classifier for the binary JobSatisfaction label.
classifier = GaussianNB()
classifier.fit(x_train_scaled, y_train['JobSatisfaction'])
y_pred = classifier.predict(x_test_scaled)

# Confusion matrix on the test set: rows are actual classes, columns predicted.
conf_matrix = metrics.confusion_matrix(y_test['JobSatisfaction'], y_pred)
sns.heatmap(conf_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Test-set scores for the current JobSatisfaction predictions; precision,
# recall and F1 are reported per class (average=None).
_actual = y_test['JobSatisfaction']
accuracy = metrics.accuracy_score(_actual, y_pred)
error = 1 - metrics.accuracy_score(_actual, y_pred)
precision, recall, F1_score = [
    _scorer(_actual, y_pred, average=None)
    for _scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
]
print([accuracy, error, precision, recall, F1_score])

In [None]:
# RBF-kernel support vector classifier for the binary JobSatisfaction label.
classifier = SVC(kernel='rbf')
classifier.fit(x_train_scaled, y_train['JobSatisfaction'])
y_pred = classifier.predict(x_test_scaled)

# Confusion matrix on the test set: rows are actual classes, columns predicted.
conf_matrix = metrics.confusion_matrix(y_test['JobSatisfaction'], y_pred)
sns.heatmap(conf_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

# Fraction of training rows that ended up as support vectors (cell output).
len(classifier.support_vectors_) / len(x_train_scaled)

In [None]:
# Test-set scores for the current JobSatisfaction predictions; precision,
# recall and F1 are reported per class (average=None).
_truth = y_test['JobSatisfaction']
_perClass = dict(average=None)
accuracy = metrics.accuracy_score(_truth, y_pred)
error = 1 - metrics.accuracy_score(_truth, y_pred)
precision = metrics.precision_score(_truth, y_pred, **_perClass)
recall = metrics.recall_score(_truth, y_pred, **_perClass)
F1_score = metrics.f1_score(_truth, y_pred, **_perClass)
print([accuracy, error, precision, recall, F1_score])

In [None]:
# Entropy-criterion decision tree (fixed seed) for the binary JobSatisfaction label.
classifier = DecisionTreeClassifier(random_state=0, criterion='entropy')
classifier.fit(x_train_scaled, y_train['JobSatisfaction'])
y_pred = classifier.predict(x_test_scaled)

# Confusion matrix on the test set: rows are actual classes, columns predicted.
conf_matrix = metrics.confusion_matrix(y_test['JobSatisfaction'], y_pred)
sns.heatmap(conf_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt='.3f')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Test-set scores for the current JobSatisfaction predictions; precision,
# recall and F1 are reported per class (average=None).
_truth = y_test['JobSatisfaction']
_perClass = dict(average=None)
accuracy = metrics.accuracy_score(_truth, y_pred)
error = 1 - metrics.accuracy_score(_truth, y_pred)
precision = metrics.precision_score(_truth, y_pred, **_perClass)
recall = metrics.recall_score(_truth, y_pred, **_perClass)
F1_score = metrics.f1_score(_truth, y_pred, **_perClass)
print([accuracy, error, precision, recall, F1_score])

In [None]:
# 6-nearest-neighbours classifier for the binary JobSatisfaction label.
classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(x_train_scaled, y_train['JobSatisfaction'])
y_pred = classifier.predict(x_test_scaled)

# Confusion matrix on the test set: rows are actual classes, columns predicted.
conf_matrix = metrics.confusion_matrix(y_test['JobSatisfaction'], y_pred)
sns.heatmap(conf_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt='.3f')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Test-set scores for the current JobSatisfaction predictions; precision,
# recall and F1 are reported per class (average=None).
_actual = y_test['JobSatisfaction']
accuracy = metrics.accuracy_score(_actual, y_pred)
error = 1 - metrics.accuracy_score(_actual, y_pred)
precision, recall, F1_score = [
    _scorer(_actual, y_pred, average=None)
    for _scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
]
print([accuracy, error, precision, recall, F1_score])

In [None]:
# Single-linkage hierarchical clustering of the scaled training features, cut
# into at most two flat clusters.
clustering = linkage(x_train_scaled, method='single', metric='euclidean')
clusters = fcluster(clustering, t=2, criterion='maxclust')

# fcluster numbers clusters from 1; subtracting 1 puts them on the 0/1 scale of
# JobSatisfaction for the contingency matrix.
cont_matrix = metrics.confusion_matrix(y_train['JobSatisfaction'], clusters - 1)
sns.heatmap(cont_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Agreement with JobSatisfaction (adjusted Rand index) and cluster separation
# (silhouette coefficient) for the current cluster labels, shifted to start at 0.
_labels = clusters - 1
adjusted_rand_index = metrics.adjusted_rand_score(y_train['JobSatisfaction'], _labels)
silhouette_coefficient = metrics.silhouette_score(x_train_scaled, _labels, metric="euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Complete-linkage hierarchical clustering of the scaled training features, cut
# into at most two flat clusters.
clustering = linkage(x_train_scaled, method='complete', metric='euclidean')
clusters = fcluster(clustering, t=2, criterion='maxclust')

# fcluster numbers clusters from 1; subtracting 1 puts them on the 0/1 scale of
# JobSatisfaction for the contingency matrix.
cont_matrix = metrics.confusion_matrix(y_train['JobSatisfaction'], clusters - 1)
sns.heatmap(cont_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Agreement with JobSatisfaction (adjusted Rand index) and cluster separation
# (silhouette coefficient) for the current cluster labels, shifted to start at 0.
_labels = clusters - 1
adjusted_rand_index = metrics.adjusted_rand_score(y_train['JobSatisfaction'], _labels)
silhouette_coefficient = metrics.silhouette_score(x_train_scaled, _labels, metric="euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# k-means with two clusters: 200 random initialisations, fixed seed.
clustering = KMeans(n_clusters=2, init='random', n_init=200, random_state=0)
clustering.fit(x_train_scaled)
clusters = clustering.labels_

# Contingency matrix of JobSatisfaction (rows) against cluster label (columns).
cont_matrix = metrics.confusion_matrix(y_train['JobSatisfaction'], clusters)
sns.heatmap(cont_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Agreement with JobSatisfaction (adjusted Rand index) and cluster separation
# (silhouette coefficient) for the current cluster labels.
_truth = y_train['JobSatisfaction']
adjusted_rand_index = metrics.adjusted_rand_score(_truth, clusters)
silhouette_coefficient = metrics.silhouette_score(x_train_scaled, clusters, metric="euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# DBSCAN on the scaled training features (eps = 10, min_samples = 100).
clustering = DBSCAN(eps=10, min_samples=100, metric="euclidean")
clustering.fit(x_train_scaled)
clusters = clustering.labels_

# Labels are shifted by +1 for the contingency matrix, so DBSCAN's noise
# label -1 appears as column 0.
cont_matrix = metrics.confusion_matrix(y_train['JobSatisfaction'], clusters + 1)
sns.heatmap(cont_matrix, cmap=plt.cm.Blues, square=True, annot=True, fmt=".3f")
plt.title('Contingency matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()

In [None]:
# Scores for the DBSCAN labels, shifted by +1 so the noise label -1 becomes 0.
_shiftedLabels = clusters + 1
adjusted_rand_index = metrics.adjusted_rand_score(y_train['JobSatisfaction'], _shiftedLabels)

# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1. DBSCAN can return a single label (everything in
# one cluster, or everything noise), so report NaN in that case rather than
# aborting the cell.
_distinctLabels = len(np.unique(_shiftedLabels))
if 2 <= _distinctLabels <= len(_shiftedLabels) - 1:
    silhouette_coefficient = metrics.silhouette_score(x_train_scaled, _shiftedLabels, metric = "euclidean")
else:
    silhouette_coefficient = np.nan
print([adjusted_rand_index, silhouette_coefficient])