# Part 1: Pandas

In [46]:
import pandas as pd

In [47]:
df = pd.read_csv('data/survey_results_public.csv')

In [48]:
df.head(10); # ';' hides the output of the cell

In [49]:
df.shape

(88883, 85)

#### The info() method reports the number of rows and columns and the data type of every column.

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88883 entries, 0 to 88882
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Respondent              88883 non-null  int64  
 1   MainBranch              88331 non-null  object 
 2   Hobbyist                88883 non-null  object 
 3   OpenSourcer             88883 non-null  object 
 4   OpenSource              86842 non-null  object 
 5   Employment              87181 non-null  object 
 6   Country                 88751 non-null  object 
 7   Student                 87014 non-null  object 
 8   EdLevel                 86390 non-null  object 
 9   UndergradMajor          75614 non-null  object 
 10  EduOther                84260 non-null  object 
 11  OrgSize                 71791 non-null  object 
 12  DevType                 81335 non-null  object 
 13  YearsCode               87938 non-null  object 
 14  Age1stCode              87634 non-null

#### pd.set_option(), displaying all the columns and rows

In [6]:
# The survey data has 85 columns; raise the display limit so none are elided with '...'
pd.set_option('display.max_columns',85)
# Not enabled here: the equivalent setting for rows
#pd.set_option('display.max_rows',85)

#### A trailing ';' (semicolon) hides the output of the cell; pressing 'o' in command mode does the same thing

In [7]:
df;

#### comparing column names and questions

In [8]:
schema_df = pd.read_csv('data/survey_results_schema.csv')

In [9]:
schema_df.head(85)

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,OpenSourcer,How often do you contribute to open source?
4,OpenSource,How do you feel about the quality of open sour...
...,...,...
80,Sexuality,Which of the following do you currently identi...
81,Ethnicity,Which of the following do you identify as? Ple...
82,Dependents,"Do you have any dependents (e.g., children, el..."
83,SurveyLength,How do you feel about the length of the survey...


# Part 2: Dataframe in python
A DataFrame is like a dictionary of lists: the dict keys act as the column names and each list holds that column's values, one per row. For example:

In [52]:
# Column-oriented sample data: every key is a column name and
# every list holds that column's values, one entry per person.
people = dict(
    first=['prasiddha', 'ram', 'hari'],
    last=['pokhrel', 'regmi', 'poudel'],
    email=['prasiddhapokhrel@gmail.com', 'ram@email.com', 'hari@email.com'],
)

In [53]:
people['first']

['prasiddha', 'ram', 'hari']

In [54]:
import pandas as pd

In [55]:
df = pd.DataFrame(people)
df

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddhapokhrel@gmail.com
1,ram,regmi,ram@email.com
2,hari,poudel,hari@email.com


In [56]:
df['email']
# A single column is a pandas Series; uncomment the next line to confirm:
# type(df['email'])

pandas.core.series.Series

##### df.email gives the same result, but df['email'] is preferred: if a column has the same name as a DataFrame method or attribute (for example count), df.count refers to the method rather than the column, so attribute access does not return the column.

#### Accessing multiple columns

In [15]:
df[['first','last']]

Unnamed: 0,first,last
0,prasiddha,pokhrel
1,ram,regmi
2,hari,poudel


### Getting all the columns

In [16]:
df.columns

Index(['first', 'last', 'email'], dtype='object')

### Getting all the rows
To get rows we can use loc and iloc.
iloc lets us access rows by integer location (hence the name: iloc -> integer location).

#### Accessing first row by iloc

In [17]:
df.iloc[0]

first                     prasiddha
last                        pokhrel
email    prasiddhapokhrel@gmail.com
Name: 0, dtype: object

#### Accessing multiple rows
By passing list of index

In [18]:
df.iloc[[0,1]]

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddhapokhrel@gmail.com
1,ram,regmi,ram@email.com


#### Accessing row and columns by using iloc

To do that, we pass the rows as the first argument and the columns as the second. For example:
df.iloc[[0,1],0] returns rows 0 and 1 of column 0 (the first column)

In [19]:
df.iloc[[0,1],0]

0    prasiddha
1          ram
Name: first, dtype: object

In [20]:
df.iloc[[0,1],[0,2]]

Unnamed: 0,first,email
0,prasiddha,prasiddhapokhrel@gmail.com
1,ram,ram@email.com


## Let's search by loc
With loc we search by labels; by default the row labels are a range of integers.
For now it looks similar to iloc, but we will look at the use cases where loc differs later.

In [21]:
df.loc[0]

first                     prasiddha
last                        pokhrel
email    prasiddhapokhrel@gmail.com
Name: 0, dtype: object

In [58]:
df.loc[[0,1]]

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddhapokhrel@gmail.com
1,ram,regmi,ram@email.com


In [23]:
df.loc[[0,1],['last','email']]

Unnamed: 0,last,email
0,pokhrel,prasiddhapokhrel@gmail.com
1,regmi,ram@email.com


In [24]:
df = pd.read_csv('data/survey_results_public.csv')
df.head(5)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",,,4.0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a jo...","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,,"Developer, desktop or enterprise applications;...",,17,,,,,,,I am actively looking for a job,I've never had a job,,,Financial performance or funding status of the...,"Something else changed (education, award, medi...",,,,,,,,,,,,,,,,,C++;HTML/CSS;Python,C++;HTML/CSS;JavaScript;SQL,,MySQL,Windows,Windows,Django,Django,,,Atom;PyCharm,Windows,I do not use containers,,Useful across many domains and could change ma...,Yes,Yes,Yes,Instagram,Online,Username,2017,Daily or almost daily,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,11-30 minutes,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, and I don't know what those are","Yes, somewhat",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",100 to 499 employees,"Designer;Developer, back-end;Developer, front-...",3.0,22,1,Slightly satisfied,Slightly satisfied,Not at all confident,Not sure,Not sure,"I’m not actively looking, but I am open to new...",1-2 years ago,Interview with people in peer roles,No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,THB,Thai baht,23000.0,Monthly,8820.0,40.0,There's no schedule or spec; I work on what se...,Distracting work environment;Inadequate access...,Less than once per month / Never,Home,Average,No,,"No, but I think we should",Not sure,I have little or no influence,HTML/CSS,Elixir;HTML/CSS,PostgreSQL,PostgreSQL,,,,Other(s):,,,Vim;Visual Studio Code,Linux-based,I do not use containers,,,Yes,Yes,Yes,Reddit,In real life (in person),Username,2011,A few times per week,Find answers to specific questions;Learn how t...,6-10 times per week,They were about the same,,Yes,Less than once per month or monthly,Yes,"No, I've heard of them, but I am not part of a...",Neutral,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,100 to 499 employees,"Developer, full-stack",3.0,16,Less than 1 year,Very satisfied,Slightly satisfied,Very confident,No,Not sure,I am not interested in new job opportunities,Less than a year ago,"Write code by hand (e.g., on a whiteboard);Int...",No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,USD,United States dollar,61000.0,Yearly,61000.0,80.0,There's no schedule or spec; I work on what se...,,Less than once per month / Never,Home,A little below average,No,,"No, but I think we should",Developers typically have the most influence o...,I have little or no influence,C;C++;C#;Python;SQL,C;C#;JavaScript;SQL,MySQL;SQLite,MySQL;SQLite,Linux;Windows,Linux;Windows,,,.NET,.NET,Eclipse;Vim;Visual Studio;Visual Studio Code,Windows,I do not use containers,Not at all,"Useful for decentralized currency (i.e., Bitcoin)",Yes,SIGH,Yes,Reddit,In real life (in person),Username,2014,Daily or almost daily,Find answers to specific questions;Pass the ti...,1-2 times per week,Stack Overflow was much faster,31-60 minutes,Yes,Less than once per month or monthly,Yes,"No, and I don't know what those are","No, not really",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,"10,000 or more employees","Academic researcher;Developer, desktop or ente...",16.0,14,9,Very dissatisfied,Slightly dissatisfied,Somewhat confident,Yes,No,I am not interested in new job opportunities,Less than a year ago,"Write any code;Write code by hand (e.g., on a ...",No,"Industry that I'd be working in;Languages, fra...",I was preparing for a job search,UAH,Ukrainian hryvnia,,,,55.0,There is a schedule and/or spec (made by me or...,Being tasked with non-development work;Inadequ...,A few days each month,Office,A little above average,"Yes, because I see value in code review",,"Yes, it's part of our process",Not sure,I have little or no influence,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,HTML/CSS;Java;JavaScript;SQL;WebAssembly,Couchbase;MongoDB;MySQL;Oracle;PostgreSQL;SQLite,Couchbase;Firebase;MongoDB;MySQL;Oracle;Postgr...,Android;Linux;MacOS;Slack;Windows,Android;Docker;Kubernetes;Linux;Slack,Django;Express;Flask;jQuery;React.js;Spring,Flask;jQuery;React.js;Spring,Cordova;Node.js,Apache Spark;Hadoop;Node.js;React Native,IntelliJ;Notepad++;Vim,Linux-based,"Outside of work, for personal projects",Not at all,,Yes,Also Yes,Yes,Facebook,In real life (in person),Username,I don't remember,Multiple times per day,Find answers to specific questions,More than 10 times per week,Stack Overflow was much faster,,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, I've heard of them, but I am not part of a...","Yes, definitely",Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [25]:
df.shape

(88883, 85)

In [26]:
df.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

In [27]:
df['Hobbyist']

0        Yes
1         No
2        Yes
3         No
4        Yes
        ... 
88878    Yes
88879     No
88880     No
88881     No
88882    Yes
Name: Hobbyist, Length: 88883, dtype: object

In [28]:
df_schema = pd.read_csv('data/survey_results_schema.csv')
df_schema

Unnamed: 0,Column,QuestionText
0,Respondent,Randomized respondent ID number (not in order ...
1,MainBranch,Which of the following options best describes ...
2,Hobbyist,Do you code as a hobby?
3,OpenSourcer,How often do you contribute to open source?
4,OpenSource,How do you feel about the quality of open sour...
...,...,...
80,Sexuality,Which of the following do you currently identi...
81,Ethnicity,Which of the following do you identify as? Ple...
82,Dependents,"Do you have any dependents (e.g., children, el..."
83,SurveyLength,How do you feel about the length of the survey...


In [29]:
df_schema.iloc[2]

Column                         Hobbyist
QuestionText    Do you code as a hobby?
Name: 2, dtype: object

#### Counting values

In [30]:
df['Hobbyist'].value_counts()

Hobbyist
Yes    71257
No     17626
Name: count, dtype: int64

#### Getting specific rows and column

In [31]:
df.loc[[0,1,2],'Hobbyist']

0    Yes
1     No
2    Yes
Name: Hobbyist, dtype: object

#### Slicing with loc is inclusive, unlike list slicing: the end label of the slice is included in the result
Here loc returns rows 0, 1 and 2; slicing a Python list with 0:2 would return only items 0 and 1

In [32]:
df.loc[0:2,'Hobbyist']

0    Yes
1     No
2    Yes
Name: Hobbyist, dtype: object

#### Using slicing method to get multiple rows and columns


In [33]:
df.loc[0:2,'Hobbyist':'Employment']

Unnamed: 0,Hobbyist,OpenSourcer,OpenSource,Employment
0,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work"
1,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work"
2,Yes,Never,The quality of OSS and closed source software ...,Employed full-time


# Part 3: How to Set, Reset, and Use Indexes

Indexes are used to identify the rows; by default the rows are labelled 0, 1, 2, ...
However, we can also set our own row labels and reset them later.

### Let's see in practice

In [34]:
# Sample data for the indexing examples: three people, one column per key.
people = dict(
    first=['prasiddha', 'ram', 'hari'],
    last=['pokhrel', 'regmi', 'poudel'],
    email=['prasiddha@email.com', 'ram@email.com', 'hari@email.com'],
)

# Rows get the default integer labels 0, 1, 2.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddha@email.com
1,ram,regmi,ram@email.com
2,hari,poudel,hari@email.com


In [35]:
# Make the 'email' column the row index.
# set_index returns a new DataFrame by default; inplace = True modifies df itself instead.
df.set_index('email', inplace = True)

In [36]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
prasiddha@email.com,prasiddha,pokhrel
ram@email.com,ram,regmi
hari@email.com,hari,poudel


In [37]:
df.index

Index(['prasiddha@email.com', 'ram@email.com', 'hari@email.com'], dtype='object', name='email')

In [38]:
# accessing with index name (ram@email.com) and column name (first)
df.loc['ram@email.com', 'first']

'ram'

In [63]:
# The row labels are now e-mail addresses, so the integer label 0 no longer
# exists and df.loc[0] raises KeyError. Catch it so the cell still runs.
try:
    df.loc[0]
except KeyError as err:
    print(f'df.loc[0] raised KeyError: {err}')

# iloc is positional, so it still returns the first row whatever the index is.
b = df.iloc[0]
b

first                     prasiddha
last                        pokhrel
email    prasiddhapokhrel@gmail.com
Name: 0, dtype: object


### Resetting the index

In [40]:
df.reset_index(inplace = True)
df

Unnamed: 0,email,first,last
0,prasiddha@email.com,prasiddha,pokhrel
1,ram@email.com,ram,regmi
2,hari@email.com,hari,poudel


## Practicing in stackoverflow data

In [41]:
# index_col sets the row labels at load time:
# 'Respondent' for the survey answers, 'Column' for the schema.
df = pd.read_csv(
    'data/survey_results_public.csv',
    index_col='Respondent',
)
schema_df = pd.read_csv(
    'data/survey_results_schema.csv',
    index_col='Column',
)

In [42]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [43]:
df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",,,4.0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a jo...","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,,"Developer, desktop or enterprise applications;...",,17,,,,,,,I am actively looking for a job,I've never had a job,,,Financial performance or funding status of the...,"Something else changed (education, award, medi...",,,,,,,,,,,,,,,,,C++;HTML/CSS;Python,C++;HTML/CSS;JavaScript;SQL,,MySQL,Windows,Windows,Django,Django,,,Atom;PyCharm,Windows,I do not use containers,,Useful across many domains and could change ma...,Yes,Yes,Yes,Instagram,Online,Username,2017,Daily or almost daily,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,11-30 minutes,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, and I don't know what those are","Yes, somewhat",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",100 to 499 employees,"Designer;Developer, back-end;Developer, front-...",3.0,22,1,Slightly satisfied,Slightly satisfied,Not at all confident,Not sure,Not sure,"I’m not actively looking, but I am open to new...",1-2 years ago,Interview with people in peer roles,No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,THB,Thai baht,23000.0,Monthly,8820.0,40.0,There's no schedule or spec; I work on what se...,Distracting work environment;Inadequate access...,Less than once per month / Never,Home,Average,No,,"No, but I think we should",Not sure,I have little or no influence,HTML/CSS,Elixir;HTML/CSS,PostgreSQL,PostgreSQL,,,,Other(s):,,,Vim;Visual Studio Code,Linux-based,I do not use containers,,,Yes,Yes,Yes,Reddit,In real life (in person),Username,2011,A few times per week,Find answers to specific questions;Learn how t...,6-10 times per week,They were about the same,,Yes,Less than once per month or monthly,Yes,"No, I've heard of them, but I am not part of a...",Neutral,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,100 to 499 employees,"Developer, full-stack",3.0,16,Less than 1 year,Very satisfied,Slightly satisfied,Very confident,No,Not sure,I am not interested in new job opportunities,Less than a year ago,"Write code by hand (e.g., on a whiteboard);Int...",No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,USD,United States dollar,61000.0,Yearly,61000.0,80.0,There's no schedule or spec; I work on what se...,,Less than once per month / Never,Home,A little below average,No,,"No, but I think we should",Developers typically have the most influence o...,I have little or no influence,C;C++;C#;Python;SQL,C;C#;JavaScript;SQL,MySQL;SQLite,MySQL;SQLite,Linux;Windows,Linux;Windows,,,.NET,.NET,Eclipse;Vim;Visual Studio;Visual Studio Code,Windows,I do not use containers,Not at all,"Useful for decentralized currency (i.e., Bitcoin)",Yes,SIGH,Yes,Reddit,In real life (in person),Username,2014,Daily or almost daily,Find answers to specific questions;Pass the ti...,1-2 times per week,Stack Overflow was much faster,31-60 minutes,Yes,Less than once per month or monthly,Yes,"No, and I don't know what those are","No, not really",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,"10,000 or more employees","Academic researcher;Developer, desktop or ente...",16.0,14,9,Very dissatisfied,Slightly dissatisfied,Somewhat confident,Yes,No,I am not interested in new job opportunities,Less than a year ago,"Write any code;Write code by hand (e.g., on a ...",No,"Industry that I'd be working in;Languages, fra...",I was preparing for a job search,UAH,Ukrainian hryvnia,,,,55.0,There is a schedule and/or spec (made by me or...,Being tasked with non-development work;Inadequ...,A few days each month,Office,A little above average,"Yes, because I see value in code review",,"Yes, it's part of our process",Not sure,I have little or no influence,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,HTML/CSS;Java;JavaScript;SQL;WebAssembly,Couchbase;MongoDB;MySQL;Oracle;PostgreSQL;SQLite,Couchbase;Firebase;MongoDB;MySQL;Oracle;Postgr...,Android;Linux;MacOS;Slack;Windows,Android;Docker;Kubernetes;Linux;Slack,Django;Express;Flask;jQuery;React.js;Spring,Flask;jQuery;React.js;Spring,Cordova;Node.js,Apache Spark;Hadoop;Node.js;React Native,IntelliJ;Notepad++;Vim,Linux-based,"Outside of work, for personal projects",Not at all,,Yes,Also Yes,Yes,Facebook,In real life (in person),Username,I don't remember,Multiple times per day,Find answers to specific questions,More than 10 times per week,Stack Overflow was much faster,,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, I've heard of them, but I am not part of a...","Yes, definitely",Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [73]:
schema_df.shape

(85, 1)

In [77]:
schema_df.head()

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
OpenSourcer,How often do you contribute to open source?
OpenSource,How do you feel about the quality of open sour...


In [44]:
df.loc[1] # accessing row by index name

MainBranch                           I am a student who is learning to code
Hobbyist                                                                Yes
OpenSourcer                                                           Never
OpenSource                The quality of OSS and closed source software ...
Employment                           Not employed, and not looking for work
Country                                                      United Kingdom
Student                                                                  No
EdLevel                                           Primary/elementary school
UndergradMajor                                                          NaN
EduOther                  Taught yourself a new language, framework, or ...
OrgSize                                                                 NaN
DevType                                                                 NaN
YearsCode                                                                 4
Age1stCode  

In [79]:
schema_df['QuestionText']

Column
Respondent                Randomized respondent ID number (not in order ...
MainBranch                Which of the following options best describes ...
Hobbyist                                            Do you code as a hobby?
OpenSourcer                     How often do you contribute to open source?
OpenSource                How do you feel about the quality of open sour...
Employment                Which of the following best describes your cur...
Country                           In which country do you currently reside?
Student                   Are you currently enrolled in a formal, degree...
EdLevel                   Which of the following best describes the high...
UndergradMajor            What was your main or most important field of ...
EduOther                  Which of the following types of non-degree edu...
OrgSize                   Approximately how many people are employed by ...
DevType                   Which of the following describe you? Please se...
Years

In [66]:
# passing row index as 'MgrIdiot' and column name we need to access as 'QuestionText'
schema_df.loc['MgrIdiot', 'QuestionText']

'How confident are you that your manager knows what they’re doing?'

### Sorting

In [80]:
schema_df.sort_index(ascending = False, inplace = True)

In [81]:
schema_df

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
YearsCodePro,How many years have you coded professionally (...
YearsCode,"Including any education, how many years have y..."
WorkWeekHrs,"On average, how many hours per week do you work?"
WorkRemote,How often do you work remotely?
WorkPlan,How structured or planned is your work?
WorkLoc,Where would you prefer to work?
WorkChallenge,"Of these options, what are your greatest chall..."
WelcomeChange,"Compared to last year, how welcome do you feel..."
WebFrameWorkedWith,Which of the following web frameworks have you...
WebFrameDesireNextYear,Which of the following web frameworks have you...


# Part 4: Filtering data

In data science we usually start by filtering the data, so it is one of the most important skills to learn. Let's see it in practice.

### Let's see in practice

In [84]:
# Sample data for the filtering examples; two of the three people share
# the last name 'regmi' so the filters below have something to match.
people = dict(
    first=['prasiddha', 'ram', 'hari'],
    last=['pokhrel', 'regmi', 'regmi'],
    email=['prasiddha@email.com', 'ramregmi@email.com', 'hariregmi@email.com'],
)

df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddha@email.com
1,ram,regmi,ramregmi@email.com
2,hari,regmi,hariregmi@email.com


### Using df to filter data
Indexing the dataframe directly with a filter (`df[filt]`) works, but `loc` is preferred because it also lets us pass the column names we want to get back.

In [85]:
# filtering people with last name = 'regmi'
filt = df['last'] == 'regmi'
filt

0    False
1     True
2     True
Name: last, dtype: bool

In [89]:
df[filt]
df[pd.Series([False, True, True], index = [0,1,2])] # both lines provides same output

Unnamed: 0,first,last,email
1,ram,regmi,ramregmi@email.com
2,hari,regmi,hariregmi@email.com


### Using loc to filter data
`loc` is convenient because it can also return only specific columns of the filtered rows.

In [None]:
# filtering people with last name 'regmi' and getting their email
filt = df['last'] == 'regmi'
df.loc[filt, 'email'] # here 'email' is the dataframe column

### Using conditional operators '&' and '|' to filter out data

1. Filtering people who have first name ram and last name regmi (using '&')
2. Filtering people who have first name prasiddha or last name regmi (using '|')

In [None]:
filt = (df['first'] == 'ram') & (df['last'] == 'regmi')
df.loc[filt, ['first', 'last', 'email']]

In [90]:
filt = (df['first'] == 'prasiddha') | (df['last'] == 'regmi')
df.loc[filt]

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddha@email.com
1,ram,regmi,ramregmi@email.com
2,hari,regmi,hariregmi@email.com


### Using '~' tilde to filter opposite value of the filter

In [91]:
# filtering people which doesn't have last 'regmi'
filt = df['last'] == 'regmi'
df.loc[~filt]

Unnamed: 0,first,last,email
0,prasiddha,pokhrel,prasiddha@email.com


## Practicing filter in stackoverflow data

In [92]:
df = pd.read_csv('data/survey_results_public.csv', index_col ='Respondent') # setting index when loading data with the column name 'Respondent'
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col = 'Column')

In [93]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [94]:
df.head(2)

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",,,4.0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a jo...","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,,"Developer, desktop or enterprise applications;...",,17,,,,,,,I am actively looking for a job,I've never had a job,,,Financial performance or funding status of the...,"Something else changed (education, award, medi...",,,,,,,,,,,,,,,,,C++;HTML/CSS;Python,C++;HTML/CSS;JavaScript;SQL,,MySQL,Windows,Windows,Django,Django,,,Atom;PyCharm,Windows,I do not use containers,,Useful across many domains and could change ma...,Yes,Yes,Yes,Instagram,Online,Username,2017,Daily or almost daily,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,11-30 minutes,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, and I don't know what those are","Yes, somewhat",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult


In [None]:
schema_df

In [95]:
schema_df.loc['ConvertedComp', 'QuestionText']

'Salary converted to annual USD salaries using the exchange rate on 2019-02-01, assuming 12 working months and 50 working weeks.'

In [96]:
# filtering salary over 40000
high_salary = df['ConvertedComp'] > 40000
df.loc[high_salary,['Country', 'LanguageWorkedWith', 'ConvertedComp']]

Unnamed: 0_level_0,Country,LanguageWorkedWith,ConvertedComp
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,United States,C;C++;C#;Python;SQL,61000.0
6,Canada,Java;R;SQL,366420.0
9,New Zealand,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...,95179.0
13,United States,Bash/Shell/PowerShell;HTML/CSS;JavaScript;PHP;...,90000.0
14,Germany,C++,57060.0
...,...,...,...
88877,United States,Bash/Shell/PowerShell;C;Clojure;HTML/CSS;Java;...,2000000.0
88878,United States,HTML/CSS;JavaScript;Scala;TypeScript,130000.0
88879,Finland,Bash/Shell/PowerShell;C++;Python,82488.0
88881,Austria,Bash/Shell/PowerShell;Go;HTML/CSS;Java;JavaScr...,68745.0


### Filtering data with country names

In [None]:
countries = ['Nepal', 'United States', 'India', 'Canada', 'Germany']
filt = df['Country'].isin(countries)
df.loc[filt, 'Country']

### Get people who knew 'Python' as programming language

Using str method in pandas

In [101]:
# na = False makes rows with a missing LanguageWorkedWith value count as "does not contain Python"
filt = df['LanguageWorkedWith'].str.contains('Python', na = False)
# only the last expression of a cell is displayed, so the filtered column is shown here
df.loc[filt,'LanguageWorkedWith']

Respondent
1                          HTML/CSS;Java;JavaScript;Python
2                                      C++;HTML/CSS;Python
4                                      C;C++;C#;Python;SQL
5              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
8        Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
                               ...                        
84539    Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
85738      Bash/Shell/PowerShell;C++;Python;Ruby;Other(s):
86566      Bash/Shell/PowerShell;HTML/CSS;Python;Other(s):
87739             C;C++;HTML/CSS;JavaScript;PHP;Python;SQL
88212                           HTML/CSS;JavaScript;Python
Name: LanguageWorkedWith, Length: 36443, dtype: object

In [99]:
df.loc[88816,'LanguageWorkedWith']

nan

In [107]:
# counting the respondents from Nepal; .get('Nepal', 0) falls back to 0 when 'Nepal' is not among the counted values
df['Country'].value_counts().get('Nepal',0)

237

In [108]:
# building a boolean filter: respondents from Nepal whose converted salary is higher than 12000
filt = (df['Country'] == 'Nepal') & (df['ConvertedComp'] > 12000)
# counting the True values of the filter, i.e. the matching rows (0 when nothing matches)
filt.value_counts().get(True,0)

8

In [109]:
df.loc[filt, ['LanguageWorkedWith','ConvertedComp']]

Unnamed: 0_level_0,LanguageWorkedWith,ConvertedComp
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
5993,HTML/CSS;JavaScript;PHP,300000.0
25261,C#;HTML/CSS;Java;JavaScript;PHP;Python;SQL,21096.0
39526,Assembly;C;C++;C#;HTML/CSS;Java;JavaScript;PHP...,100000.0
43075,JavaScript;PHP;SQL,30000.0
61673,C#;HTML/CSS;JavaScript;SQL;TypeScript,12660.0
77938,Bash/Shell/PowerShell;C++;HTML/CSS;Java;JavaSc...,12660.0
79024,HTML/CSS;Java;JavaScript;SQL;Other(s):,15818.0
85446,Java;JavaScript;Python;SQL,21096.0


### Using boolean masking to get first 5 rows

In [None]:
mask = [True] * 5 + [False] * (len(df) - 5)

In [None]:
len(mask)

In [None]:
df.loc[mask]

## Part 5: Updating Rows and Columns - Modifying Data within DataFrames
We will learn how to update the data in our rows and columns, and in the next chapter we will learn how to remove rows and columns from our dataframes.

What we will learn:

    - Changing column names at once and using rename method and list comprehension
    - String method replace, upper, lower
    - Update values of rows (series) by filtering using 'loc' and by using lambda function
    - Update values of dataframe using applymap
    - apply, map, applymap and replace

In [None]:
people = {
    'first': ['prasiddha', 'ram', 'hari'],
    'last': ['pokhrel', 'regmi', 'regmi'],
    'email': ['Prasiddha@email.com', 'RamRegmi@email.com', 'HariRegmi@email.com']
}

df = pd.DataFrame(people)
df

### Changing the column names
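There are two approaches: the first (assigning to `df.columns`) replaces every column name at once, while the second (`rename`) is preferred when only specific columns need to change.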

In [None]:
df.columns

#### First approach to rename column names: use when all the columns name needed to change

In [None]:
# first approach: use when all the columns name needed to change
df.columns = ['first name', 'last name', 'email']

In [None]:
df

### Using list comprehension and str.upper() to change all the column names to uppercase

In [None]:
# df.columns = [x.upper() for x in df.columns]
# df
df.columns = df.columns.str.upper()
df

### Using str.replace() to update column name

In [None]:
df.columns = df.columns.str.replace(" ", "_")
df.columns = [x.lower() for x in df.columns]
df

#### Second approach to rename column names: use when need to change specific columns

In [None]:
df.rename(columns={'first_name':'first', 'last_name':'last'}, inplace = True)
df

### Changing the value of single and multiple row

In [None]:
# changing value of a single row
df.loc[2] = ['hari','pokhrel', 'HariPokhrel@email.com']
df

In [None]:
# changing last and email (multiple values)
df.loc[2, ['last', 'email']] = ['regmi', 'HariRegmi@email.com']
df

In [None]:
# changing last (single values)
df.loc[2, 'last'] = 'pokhrel'
df

### Using 'at' instead of 'loc'

In [None]:
df.at[2, 'last'] = 'regmi'
df

In [None]:
filt = (df['email'] == 'HariRegmi@email.com')
# chained indexing: df[filt] returns a temporary copy, so the assignment below does not update df
# (pandas warns about it, e.g. with a SettingWithCopyWarning); use df.loc[filt, 'last'] = ... to update the value
df[filt]['last'] = 'poudel'
df

In [None]:
df.loc[filt, 'last'] = 'poudel'
df

In [None]:
df['email'] = df['email'].str.lower()
df

### apply, map, applymap, replace methods
1. apply: apply is used to call a function on our values. It works on both DataFrame and Series objects, but it behaves differently for each of them
    - Running apply on a series applies a function to every element in the series
    - Running apply on a dataframe applies a function to every series (by default, every column) in the dataframe

What if we want to apply a function to every element in a dataframe? That is what applymap is used for.

2. applymap: applymap only works for dataframe.
    - It is used to operate for each elements of the dataframe

3. map: map only works for series object
    - It is used for substituting each value in a series with another value

4. replace: 
    - It is used to work with specific elements of the dataframe/series

### apply

In [None]:
df['email'].apply(len)

#### apply function in our custom function and lambda function

In [None]:
def update_email(email):
    """Return the given email address converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [None]:
df['email'].apply(update_email)

In [None]:
df['email'] = df['email'].apply(update_email)
df

In [None]:
df['email'] = df['email'].apply(lambda x: x.lower())
df

In [None]:
df['email'].apply(len) # in default it applies through all the rows.

#### 'apply' function in DataFrame object

In [None]:
df.apply(len) # by default apply works column by column: len is called on each column (a series object), giving the no. of rows

In [None]:
df.apply(len, axis = 'columns') # with axis = 'columns' len is called on each row (a series object), so it counts the no. of columns

#### applying min function in Series object

In [None]:
df.apply(pd.Series.min)

In [None]:
df.apply(lambda x: x.min())

### applymap

In [None]:
# applymap calls len on every single element of the dataframe
# NOTE: DataFrame.applymap is deprecated since pandas 2.1; newer versions provide DataFrame.map for the same element-wise use
df.applymap(len)

In [None]:
df.applymap(str.lower)

### map

In [None]:
df['first'].map({'prasiddha':'prasidh', 'ram':'ramu'})

### replace

In [None]:
df['first'] = df['first'].replace({'prasiddha':'prasidh', 'ram':'ramu'})
df

## Practicing apply, applymap, map, replace in stackoverflow data

In [None]:
df = pd.read_csv('data/survey_results_public.csv', index_col ='Respondent') # setting index when loading data with the column name 'Respondent'
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col = 'Column')

In [None]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [None]:
df.head(1)

In [None]:
df.rename(columns={'ConvertedComp':'SalaryUSD'}, inplace = True)
df

In [None]:
# using map to change hobbyist values
df['Hobbyist'] = df['Hobbyist'].map({'Yes': True, 'No': False})
df.head(2)

In [None]:
# using replace to change hobbyist values
df['Hobbyist'] = df['Hobbyist'].replace({True: 'Yes', False: 'No'})
df.head(2)

## Part 6: Add Remove Rows and Columns From DataFrames

We will be learning:

        - to add and remove rows and columns from dataframes
        - combine multiple columns into one

Methods learned:
    
    - drop:
        - drop columns: df.drop(columns = ['first','last'], inplace = True)
        - drop rows: df.drop(index=[1,2])
    
    - str.split(' '):
        - splits the values on the separator passed to the split method; by default it splits on whitespace

    - pd.concat:
        - to add rows into the dataframe
        - df = pd.concat([df, df3], ignore_index=True)
 

The append method was deprecated in pandas 1.4 and removed in pandas 2.0.0, so concat is used instead

In [None]:
people = {
    'first': ['prasiddha', 'ram', 'hari'],
    'last': ['pokhrel', 'regmi', 'regmi'],
    'email': ['Prasiddha@email.com', 'RamRegmi@email.com', 'HariRegmi@email.com']
}

df = pd.DataFrame(people)
df

### Combine columns

In [None]:
df['first'] + ' ' + df['last']

In [None]:
df['full_name'] = df['first'] + ' ' + df['last']

In [None]:
df

### Removing columns

In [None]:
df.drop(columns = ['first','last'], inplace = True)
df

### Adding multiple columns from a single column by separating data

In [None]:
df['full_name'].str.split(' ', expand = True)

In [None]:
df[['first','last']] = df['full_name'].str.split(' ', expand = True)
df

### Adding and removing rows

1. We add single row into dataframe
2. Combine two dataframes together into a single dataframe by appending the rows of one to another  

#### Adding single row by using pd.concat([df,df2]) method

In [None]:
df2 = pd.DataFrame({'first':['Arya']})

In [None]:
df = pd.concat([df, df2], ignore_index = True, axis = 0)
# df.drop(0, axis = 1, inplace = True)
# df.drop(df.index[3:11], axis = 0, inplace = True)
df

In [None]:
if hasattr(pd.DataFrame, 'append'):
    print("The 'append' method is available in Pandas.")
else:
    print("The 'append' method is not available in Pandas.")

#### Adding multiple rows by using pd.concat() method

In [None]:
people = {
    'first': ['shyam', 'ishwor'],
    'last': ['kafle', 'gurung'],
    'email': ['ShyamKafle@email.com', 'IshworGurung@email.com']
}

df3 = pd.DataFrame(people)
df3

In [None]:
df = pd.concat([df, df3], ignore_index=True)
df

### Dropping rows

In [None]:
df.drop(index = 3, inplace = True)

In [None]:
# changing the last name of row 5 to 'kafle'
# (the old value is simply overwritten, so it does not need to be read first)
df.loc[5,'last'] = 'kafle'
df

#### Filtering out row by last name and dropping them
First we filter on the last name to be deleted; in this case we filter 'last' with 'kafle'.

Then we get the index of the matching rows and pass it to drop(index=...)

In [None]:
filt = df['last'] == 'kafle'
df[filt].index

In [None]:
# drop returns a new dataframe and leaves df untouched unless inplace = True is passed,
# so the result is assigned back to df to really remove the filtered rows
df = df.drop(index=df[filt].index)
df

In [None]:
df

## Part 7: Sorting Data

1. df.sort_values(by='last', ascending=True)
2. df.sort_values(by=['first','last'], ascending=[True, False])
3. df['last'].sort_values()
4. df['salary'].nlargest(10) # getting the 10 largest salaries
5. df['salary'].nsmallest(10) # getting the 10 smallest salaries

In [None]:
people = {
    'first': ['prasiddha', 'hari', 'ram', 'amrit'],
    'last': ['pokhrel', 'regmi', 'regmi', 'regmi'],
    'email': ['Prasiddha@email.com','HariRegmi@email.com', 'RamRegmi@email.com', 'a@email.com']
}

df = pd.DataFrame(people)
df

In [None]:
df.sort_values(by='last', ascending = False)

Sorting by last name in descending order; if two last names are the same, those rows are then sorted by first name (also in descending order)

In [None]:
df.sort_values(by=['last','first'], ascending = False)

Sorting last name in descending order and first name in ascending order by passing a list of boolean values, one per column. The first name only decides the order among rows that share the same last name: for example, ram, hari and amrit all have 'regmi' as last name, so these three rows are sorted by first name in ascending order.

In [None]:
df.sort_values(by=['last','first'], ascending=[False, True])

sort_index() sorts the rows by their index

In [None]:
df.sort_index()

sort_values() in series data (data of a single column)

In [None]:
df['last'].sort_values()

### Practicing sorting in Stackoverflow data

In [None]:
df = pd.read_csv('data/survey_results_public.csv', index_col ='Respondent') # setting index when loading data with the column name 'Respondent'
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col = 'Column')

In [None]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [None]:
df.head(1)

In [None]:
df.sort_values(by=['Country'], inplace = True)

In [None]:
df[['Country', 'ConvertedComp']]

### Sorting countries in ascending order and salary in descending order

In [None]:
df.sort_values(by=['Country','ConvertedComp'], ascending=[True, False], inplace=True)

In [None]:
df[['Country','ConvertedComp']].head(50)

### Getting largest or smallest value from the dataframe
Getting 10 highest salary

In [None]:
# nlargest is called on the series (single column) here: Series.nlargest only needs the number of values,
# while DataFrame.nlargest also requires the column(s) to order by as its second argument
df['ConvertedComp'].nlargest(10)

In [None]:
df.nlargest(10, 'ConvertedComp')

In [None]:
df.nsmallest(10, 'ConvertedComp')

## Part 8: Grouping and Aggregation - Analyzing and Exploring Data.

#### Aggregation
    - Combining multiple pieces of data into a single result. Mean, median and mode are aggregate functions because they take multiple values and give the mean, median or mode of those values.


1. df['ConvertedComp'].median()
2. df['ConvertedComp'].count() # counts the values, excluding NaN values
3. df['Hobbyist'].value_counts() # counts the occurrences of each value; for example, if the column holds ten 1s, eleven 2s and twelve 3s, it gives 1: 10, 2: 11, 3: 12

#### Grouping popular social media by countries
The group by function works in three steps

splits the objects
applies a function
combines the result

#### advance data analysis 
    using group by, apply and loc to 
        - find the number of people who are using python according to the country
        - calculate the percentage of respondent using python
        - concat method
        - agg method


In [None]:
df = pd.read_csv('data/survey_results_public.csv', index_col ='Respondent') # setting index when loading data with the column name 'Respondent'
schema_df = pd.read_csv('data/survey_results_schema.csv', index_col = 'Column')

In [None]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [None]:
df.head(3)

#### Typical salary of developer of the survey

In [None]:
df['ConvertedComp'].head(15)

In [None]:
df['ConvertedComp'].median()

In [None]:
numeric_df = df.select_dtypes(include=['number']) # selecting the columns with numeric value and calculating the median
numeric_df.median()

In [None]:
df.describe()

In [None]:
df['ConvertedComp'].value_counts()

In [None]:
df['Hobbyist'].value_counts()

In [None]:
# counting the popularity of the social media by value_counts() function
df['SocialMedia'].value_counts() # reddit was the most popular

In [None]:
df['SocialMedia'].value_counts(normalize=True)

## Grouping popular social media by countries
The group by function works in three steps
1. splits the objects
2. applies a function
3. combines the result

In [None]:
df['Country'].value_counts()

In [None]:
# returns a pandas DataFrameGroupBy object; grouping by the plain column name (not a one-element list)
# keeps the group keys as plain values, so get_group('Nepal') and .loc['Nepal'] work as used below
country_grp = df.groupby('Country')

In [None]:
country_grp.get_group('Nepal').head(1)

In [None]:
country_grp.get_group('Nepal')['ConvertedComp'].median() # getting the median salary of the Nepal

#### We can also use loc to filter the country by its name

In [None]:
filt = df['Country'] == 'Nepal'
df.loc[filt]['SocialMedia'].value_counts()

#### We have all the countries grouped by country, now we can apply a function to them

In [None]:
# getting the popular social media by groupby object and using value_counts on 'SocialMedia' column
country_grp['SocialMedia'].value_counts().loc['United Kingdom']

In [None]:
country_grp['SocialMedia'].value_counts().loc['China'].head(3)

In [None]:
country_grp['SocialMedia'].value_counts().loc['Russian Federation'].head(3)

In [None]:
country_grp['SocialMedia'].value_counts(normalize=True).loc['Russian Federation'].head(3)

#### Getting the median salary for the countries in the groupby object

In [None]:
country_grp['ConvertedComp'].median()

In [None]:
country_grp['ConvertedComp'].median().loc['Nepal']

### Agg method to calculate multiple aggregate functions like mean, median

In [None]:
country_grp['ConvertedComp'].agg(['median','mean'])

### Getting the number of people who are using python

In [None]:
# number of respondents from Nepal whose LanguageWorkedWith mentions 'Python'
filt = df['Country'] == 'Nepal'
(
    df.loc[filt, 'LanguageWorkedWith']  # rows and column selected in a single loc call
    .str.contains('Python')
    .sum()
)

In [None]:
country_grp.get_group('Nepal').head(1)

In [None]:
# the .str accessor is not available on a SeriesGroupBy object, so this call raises an AttributeError;
# it is caught and printed so the demonstration does not stop a "Restart & Run All" of the notebook
try:
    country_grp['LanguageWorkedWith'].str.contains('Python').sum()
except AttributeError as err:
    print(f"AttributeError: {err}")

In [None]:
# lambda function recap as we are working with lambda function
# check(x, item) evaluates `item in x`, i.e. it is True when item is a member of x
check = lambda x, item: item in x
check([1,2], 1) # 1 is in [1, 2], so this gives True

We can't directly use the .str accessor on a SeriesGroupBy object, but we can apply it to the individual Series within the groups.

Grouping: When you use the groupby method on a DataFrame, it divides the data into separate groups based on the values in a specified column (or multiple columns). Each group contains rows that share a common value or set of values in the grouping column(s).

SeriesGroupBy Object: Calling groupby on a DataFrame returns a DataFrameGroupBy object; selecting a single column from it, such as country_grp['LanguageWorkedWith'], gives a SeriesGroupBy object. This object represents the grouped data, but it doesn't provide Series-only accessors such as .str, so string methods can't be called on it directly.

Applying to Individual Series: To work with the data within each group, you can use the .apply method in combination with a lambda function or another custom function. This allows you to perform operations on each individual Series (column) within the groups separately.

In [None]:
country_grp['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python'))

# in the below result there is a series for individual country, which makes it multi index series and apply function 
# works on each country's grouped data

In [None]:
country_grp['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())

In [None]:
country_grp['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum()).loc['Nepal']

In [None]:
country_grp['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())

 ### Exercise: find the percentage of respondent who knows python for each country

In [None]:
country_respondent = df['Country'].value_counts()
country_respondent

In [None]:
country_grp = df.groupby('Country')
country_grp

In [None]:
respondent_know_python = country_grp['LanguageWorkedWith'].apply(lambda x: x.str.contains('Python').sum())
respondent_know_python

In [None]:
python_df = pd.concat([country_respondent, respondent_know_python], axis = 'columns', sort = False)
python_df

In [None]:
# value_counts() names its result 'count' in pandas 2.x but keeps the column name ('Country') in older versions,
# so both names are mapped to 'NumRespondents'; rename ignores keys that are not present in the columns
python_df.rename(
    columns={
        'count': 'NumRespondents',
        'Country': 'NumRespondents',
        'LanguageWorkedWith': 'NumKnowsPython',
    },
    inplace = True,
)

In [None]:
python_df

In [None]:
python_df['PctKnowsPython'] = (python_df['NumKnowsPython']/python_df['NumRespondents'])*100

In [None]:
python_df

In [None]:
python_df.sort_values(by='PctKnowsPython', ascending = False, inplace = True)

In [None]:
python_df

In [None]:
python_df.loc['Japan']