In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Cleaning dataframe

In [81]:
# Sample records for the missing-value demos. Gaps appear in three forms:
# np.nan, None, and placeholder strings ('NA', 'Missing').
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com',
        None, np.nan, 'Anonymous@email.com', 'NA',
    ],
    # ages are strings here, not numbers
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [82]:
# Build the sample frame from the people dict; every value, including age,
# is either a string or a missing marker at this point.
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


## Dropping empty fields.

## df.dropna(
    axis=0, 
    how='any', 
    thresh=None, 
    subset=None, 
    inplace=False
    )
    
    axis = 0 or 'index' (drop rows) / 1 or 'columns' (drop columns)
    how  = 'any' (drop if any value is NA) / 'all' (drop only if all values are NA)
    thresh = 4 --->  keep only those rows which have at least 4 non-NA values.
    subset = list of the columns in which to look for missing values.

In [83]:
# Defaults (axis=0, how='any'): drop every row that has at least one NA value.
# Row 6 survives because 'NA' and 'Missing' are ordinary strings, not real NA values.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [84]:
# Drop every column that contains at least one NA value; each column here
# has one, so only the index is left.
df.dropna(axis='columns')

0
1
2
3
4
5
6


In [85]:
df.dropna(how = 'all')   # drop only the rows in which every value is NA,
                         # so just the row at index 4 is removed.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [86]:
df.dropna(subset=['email','first'])   # drop rows where email or first is NA.

# subset lists the required fields, i.e. the columns that must not be empty.
# With the default how='any', a row is dropped if any of those columns is NA.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


## isna()
- to find out which values are na and which are not.
- it will show a table of true and false values.

In [87]:
# True marks a real missing value (None / np.nan); the strings 'NA' and
# 'Missing' in row 6 are reported as False.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


# fillna(x)
- to fill NA values with a custom value.

In [88]:
# Shows a filled copy only: the result is not assigned and inplace is not set,
# so df keeps its NA values.
df.fillna('value not given')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,value not given,36
4,value not given,value not given,value not given,value not given
5,value not given,value not given,Anonymous@email.com,value not given
6,,Missing,,Missing


In [89]:
# Replace every NA value with 0, then discard row 6: its age is the string
# "Missing", and mathematical operations cannot be performed on a string.
df = df.fillna(0).drop(index=6)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0


In [90]:
# Every column, including age, is still of object dtype at this point.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

### finding average age.

In [91]:
# Cast df['age'] from object dtype (strings and the filled-in 0s) to integers
# so that arithmetic such as mean() works on it.

df['age'] = df['age'].astype(int)

df.dtypes

first    object
last     object
email    object
age       int32
dtype: object

In [92]:
# NOTE: rows 4 and 5 had no age and were filled with 0 earlier, so they pull this
# average down: (33 + 55 + 63 + 36 + 0 + 0) / 6 ≈ 31.17. The four known ages average 46.75.
df['age'].mean()

31.166666666666668

### Let us see an example.

In [93]:
# Smaller example: three records that are complete except for one missing age.
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'],
    age=['33', '55', np.nan],
)

tempDf = pd.DataFrame(data=people)
tempDf

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,


In [94]:
# Swap the missing age for 0 and keep the filled frame under the same name.
tempDf = tempDf.fillna(0)
tempDf

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,0


### finding average age.

In [95]:
# Cast the ages to int, then average them.
# NOTE: the missing age was replaced by 0 in the previous cell, so that 0 is
# counted here: (33 + 55 + 0) / 3 ≈ 29.33, whereas the two known ages average 44.
tempDf['age'] = tempDf['age'].astype(int)
tempDf['age'].mean()

29.333333333333332

## to convert certain entries in a file to NaN.

# Real World Data Analysis

In [130]:
# Load the survey responses and the schema table (indexed by column name).
# Both paths are relative to the notebook's working directory.
# NOTE: this rebinds df, which held the small sample frame in the earlier cells.
df = pd.read_csv("StackOverFlowSurvey2019/survey_results_public.csv")
QuestionDf = pd.read_csv("StackOverFlowSurvey2019/survey_results_schema.csv",index_col = 'Column')


In [97]:
# Schema table: the index holds each survey column name, QuestionText the question asked.
QuestionDf

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
OpenSourcer,How often do you contribute to open source?
OpenSource,How do you feel about the quality of open sour...
...,...
Sexuality,Which of the following do you currently identi...
Ethnicity,Which of the following do you identify as? Ple...
Dependents,"Do you have any dependents (e.g., children, el..."
SurveyLength,How do you feel about the length of the survey...


In [98]:
# Preview the survey responses (the display is truncated to the first and last rows).
df

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88878,88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy
88879,88601,,No,Never,The quality of OSS and closed source software ...,,,,,,...,,,,,,,,,,
88880,88802,,No,Never,,Employed full-time,,,,,...,,,,,,,,,,
88881,88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,...,,,,,,,,,,


# <font color = "purple">find average years of experience.</font>

In [99]:
# List the column names to locate the one that holds years of experience.
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

In [107]:
# Let's look at the data first: the values are numbers stored as strings,
# mixed with NaN (dtype object).
df['YearsCode'].head(20)

0       4
1     NaN
2       3
3       3
4      16
5      13
6       6
7       8
8      12
9      12
10      2
11      5
12     17
13     13
14      3
15     10
16      5
17     10
18     14
19      8
Name: YearsCode, dtype: object

In [123]:
#df['YearsCode'].fillna(0)   # filling nan values with 0
# Try converting the YearsCode column to float. This is expected to fail:
# some entries are text such as 'Less than 1 year'.
df['YearsCode'].astype(float)

ValueError: could not convert string to float: 'Less than 1 year'

In [124]:
# List the distinct values to find the entries that are not numeric.
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [131]:
# Replace the two text answers with numbers so the column can be cast to float:
# 'Less than 1 year' -> 0 and 'More than 50 years' -> 50.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df['YearsCode'] selection: that form mutates
# an intermediate object, triggers a chained-assignment warning on recent pandas,
# and leaves df unchanged once Copy-on-Write is enabled.
df['YearsCode'] = df['YearsCode'].replace({'Less than 1 year': 0, 'More than 50 years': 50})

df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 50, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [162]:
# Change the dtype of the YearsCode column from object to float; the column
# still contains NaN, which a float column can hold.
df['YearsCode'] = df['YearsCode'].astype(float)

In [164]:
# Displays a zero-filled copy only: the result is not assigned, so
# df['YearsCode'] keeps its NaN values for the statistics below.
df['YearsCode'].fillna(0)

0         4.0
1         0.0
2         3.0
3         3.0
4        16.0
         ... 
88878     0.0
88879     0.0
88880     0.0
88881     0.0
88882     8.0
Name: YearsCode, Length: 88883, dtype: float64

In [165]:
# Average years of coding over the respondents who answered; the NaN entries
# are still in the column because the zero-filled copy was never stored.
df['YearsCode'].mean()

11.660681389160544

In [166]:
# Median years of coding, for comparison with the mean computed in the previous cell.
df['YearsCode'].median()

9.0

# <font color = "purple">find country wise average years of experience.</font>

In [167]:
# Group the two relevant columns by country; country_grp is reused by the cells below.
country_grp = df[['Country','YearsCode']].groupby('Country')
# first() shows one YearsCode value per country, as a quick look at the grouping.
country_grp.first()

Unnamed: 0_level_0,YearsCode
Country,Unnamed: 1_level_1
Afghanistan,0.0
Albania,17.0
Algeria,4.0
Andorra,8.0
Angola,4.0
...,...
"Venezuela, Bolivarian Republic of...",4.0
Viet Nam,11.0
Yemen,8.0
Zambia,7.0


In [168]:
# How many respondents from Afghanistan reported each YearsCode value.
country_grp['YearsCode'].value_counts().loc['Afghanistan']

YearsCode
2.0     10
3.0      6
0.0      5
7.0      4
10.0     3
1.0      2
4.0      2
5.0      1
6.0      1
8.0      1
9.0      1
20.0     1
28.0     1
31.0     1
36.0     1
50.0     1
Name: YearsCode, dtype: int64

In [169]:
# Top 20 countries by mean YearsCode.
# The built-in groupby mean() replaces apply(lambda x: x.mean()): it yields the
# same per-country Series without a Python-level call for every group.
country_grp['YearsCode'].mean().nlargest(20)

Country
Papua New Guinea    45.000000
Monaco              27.000000
Dominica            23.000000
Gabon               17.000000
Luxembourg          16.016393
Australia           15.230892
Djibouti            15.000000
United Kingdom      14.736243
New Zealand         14.452975
Sweden              14.387046
Denmark             14.255738
Timor-Leste         14.000000
United States       13.970996
Norway              13.961538
Finland             13.720074
Italy               13.657508
Ireland             13.643287
Seychelles          13.500000
Panama              13.393939
Canada              13.269917
Name: YearsCode, dtype: float64

In [171]:
# Top 20 countries by median YearsCode, using the built-in groupby median()
# rather than apply(lambda x: x.median()).
country_grp['YearsCode'].median().nlargest(20)

Country
Papua New Guinea                         45.0
Monaco                                   27.5
Dominica                                 23.0
Gabon                                    17.0
Djibouti                                 15.0
Luxembourg                               15.0
Timor-Leste                              14.0
Seychelles                               13.5
Australia                                13.0
Democratic People's Republic of Korea    12.5
Finland                                  12.0
Sweden                                   12.0
United Kingdom                           12.0
Cuba                                     11.5
Denmark                                  11.0
Italy                                    11.0
New Zealand                              11.0
Norway                                   11.0
Panama                                   11.0
Switzerland                              10.5
Name: YearsCode, dtype: float64

In [179]:
# Looking at India's years of coding experience.
# India's rows are selected once, rather than aggregating every country twice
# and then picking a single entry from each result.
india_years = country_grp['YearsCode'].get_group('India')
print("Mean Exp. =", india_years.mean())
print("Median Exp. =", india_years.median())

Mean Exp. = 6.556418918918919
Median Exp. = 6.0
