# DataFrame and Series Basics

In [1]:
import pandas as pd

In [2]:
# A single record as a plain dict: one scalar value per field.
person = dict(
    first="Phil",
    last="Lembo",
    email="phil.lembo@gmail.com",
)

In [3]:
# Same record, but every value is wrapped in a list so that each key can hold
# a whole column of values rather than a single scalar.
people = dict(
    first=["Phil"],
    last=["Lembo"],
    email=["phil.lembo@gmail.com"],
)

In [4]:
# Column-oriented data: each key is a column name and each list holds that
# column's values, one entry per person (position i across the lists is person i).
people = dict(
    first=["Phil", "Jane", "Rob"],
    last=["Lembo", "Doe", "Roe"],
    email=["phil.lembo@gmail.com", "janedoe@email.com", "robroe@email.com"],
)

In [5]:
people["email"]  # every person's email; at this point it is still a plain Python list

['phil.lembo@gmail.com', 'janedoe@email.com', 'robroe@email.com']

In [6]:
df = pd.DataFrame(people)  # build a DataFrame from the dict of lists: keys become columns, list items become rows
df

Unnamed: 0,first,last,email
0,Phil,Lembo,phil.lembo@gmail.com
1,Jane,Doe,janedoe@email.com
2,Rob,Roe,robroe@email.com


In [None]:
# There are two major data types in pandas: DataFrames and Series, and df here has the DataFrame data type.

In [7]:
type(df)  # confirm the object's class: a pandas DataFrame

pandas.core.frame.DataFrame

In [9]:
df['email']  # bracket access to a single column returns it as a Series

0    phil.lembo@gmail.com
1       janedoe@email.com
2        robroe@email.com
Name: email, dtype: object

In [10]:
type(df['email'])  # a single column is a Series, not a DataFrame

pandas.core.series.Series

In [None]:
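# Attribute (dot) access is an alternative way to select the email column, but a column whose name matches an existing DataFrame attribute or method cannot be reached this way, so bracket access is the safer habit.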

In [12]:
df.email  # attribute-style access; yields the same Series as df['email']

0    phil.lembo@gmail.com
1       janedoe@email.com
2        robroe@email.com
Name: email, dtype: object

In [13]:
df[['last', 'email']]  # pass a list of column labels (note the double brackets) to select several columns

Unnamed: 0,last,email
0,Lembo,phil.lembo@gmail.com
1,Doe,janedoe@email.com
2,Roe,robroe@email.com


In [14]:
type(df[['last', 'email']])  # selecting several columns returns a DataFrame, not a Series

pandas.core.frame.DataFrame

In [15]:
df.columns  # the DataFrame's column labels, returned as an Index

Index(['first', 'last', 'email'], dtype='object')

In [None]:
#To get rows, use loc and iloc indexers.

#iloc = "integer location", search by integer position

#loc = search by label


In [16]:
df.iloc[0]  # first row by integer position; the column labels become the Series index

first                    Phil
last                    Lembo
email    phil.lembo@gmail.com
Name: 0, dtype: object

In [17]:
type(df.iloc[0])  # a single row comes back as a Series

pandas.core.series.Series

In [11]:
df.iloc[[0, 1]]  # pass a list of positions to grab the first two rows as a DataFrame

Unnamed: 0,first,last,email
0,Phil,Lembo,phil.lembo@gmail.com
1,Jane,Doe,janedoe@email.com


In [19]:
df.iloc[[0, 1], 2]  # first two rows (positions 0 and 1) of the third column (position 2)

0    phil.lembo@gmail.com
1       janedoe@email.com
Name: email, dtype: object

In [20]:
df.loc[0]  # the row whose index label is 0

first                    Phil
last                    Lembo
email    phil.lembo@gmail.com
Name: 0, dtype: object

In [21]:
df.loc[[0, 1]]  # rows labelled 0 and 1 (the first two rows of this frame)

Unnamed: 0,first,last,email
0,Phil,Lembo,phil.lembo@gmail.com
1,Jane,Doe,janedoe@email.com




But now we can use a column label:

In [22]:
df.loc[[0, 1], 'email']  # list of row labels + one column label -> Series

0    phil.lembo@gmail.com
1       janedoe@email.com
Name: email, dtype: object

... or a list of labels!

In [23]:
df.loc[[0, 1], ['email', 'last']]  # columns come back in the order listed here, not in the frame's own order

Unnamed: 0,email,last
0,phil.lembo@gmail.com,Lembo
1,janedoe@email.com,Doe


In [12]:
# Load the survey responses and the companion schema file; both paths are
# relative to the directory the notebook is run from.
res_df = pd.read_csv('data/survey_results_public.csv')
schema_df = pd.read_csv('data/survey_results_schema.csv')
# Raise pandas' display limits to 85 columns / 85 rows before output is truncated.
# 85 matches the column count that res_df.shape reports for the survey results.
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

Basic characteristics of dataframe (number of rows, number of columns).

In [25]:
res_df.shape  # (number of rows, number of columns)

(88883, 85)

List all the column labels.

In [26]:
res_df.columns  # Index holding every column label

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

All responses in the Hobbyist column.

In [27]:
res_df['Hobbyist']  # one column as a Series; the long output is elided in the middle

0        Yes
1         No
2        Yes
3         No
4        Yes
        ... 
88878    Yes
88879     No
88880     No
88881     No
88882    Yes
Name: Hobbyist, Length: 88883, dtype: object

Number of each response to question ('Yeses' and 'Nos').

In [28]:
res_df['Hobbyist'].value_counts()  # tally of each distinct answer, most frequent first

Yes    71257
No     17626
Name: Hobbyist, dtype: int64

All responses from first row.

In [29]:
res_df.loc[0]  # row labelled 0: one entry per column for the first respondent

Respondent                                                                1
MainBranch                           I am a student who is learning to code
Hobbyist                                                                Yes
OpenSourcer                                                           Never
OpenSource                The quality of OSS and closed source software ...
Employment                           Not employed, and not looking for work
Country                                                      United Kingdom
Student                                                                  No
EdLevel                                           Primary/elementary school
UndergradMajor                                                          NaN
EduOther                  Taught yourself a new language, framework, or ...
OrgSize                                                                 NaN
DevType                                                                 NaN
YearsCode   

Get responses to Hobbyist question in first three rows by passing in a list of rows.

In [30]:
res_df.loc[[0, 1, 2], 'Hobbyist']  # list of row labels + one column label -> Series

0    Yes
1     No
2    Yes
Name: Hobbyist, dtype: object

Can also use slice notation to retrieve a range of rows.

In [34]:
res_df.loc[0:2]  # label slice: the end label 2 is included, so this returns rows 0, 1 and 2

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,OrgSize,DevType,YearsCode,Age1stCode,YearsCodePro,CareerSat,JobSat,MgrIdiot,MgrMoney,MgrWant,JobSeek,LastHireDate,LastInt,FizzBuzz,JobFactors,ResumeUpdate,CurrencySymbol,CurrencyDesc,CompTotal,CompFreq,ConvertedComp,WorkWeekHrs,WorkPlan,WorkChallenge,WorkRemote,WorkLoc,ImpSyn,CodeRev,CodeRevHrs,UnitTests,PurchaseHow,PurchaseWhat,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,WebFrameWorkedWith,WebFrameDesireNextYear,MiscTechWorkedWith,MiscTechDesireNextYear,DevEnviron,OpSys,Containers,BlockchainOrg,BlockchainIs,BetterLife,ITperson,OffOn,SocialMedia,Extraversion,ScreenName,SOVisit1st,SOVisitFreq,SOVisitTo,SOFindAnswer,SOTimeSaved,SOHowMuchTime,SOAccount,SOPartFreq,SOJobs,EntTeams,SOComm,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",,,4.0,10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,HTML/CSS;Java;JavaScript;Python,C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL,SQLite,MySQL,MacOS;Windows,Android;Arduino;Windows,Django;Flask,Flask;jQuery,Node.js,Node.js,IntelliJ;Notepad++;PyCharm,Windows,I do not use containers,,,Yes,"Fortunately, someone else has that title",Yes,Twitter,Online,Username,2017,A few times per month or weekly,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,31-60 minutes,No,,"No, I didn't know that Stack Overflow had a jo...","No, and I don't know what those are",Neutral,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,,"Developer, desktop or enterprise applications;...",,17,,,,,,,I am actively looking for a job,I've never had a job,,,Financial performance or funding status of the...,"Something else changed (education, award, medi...",,,,,,,,,,,,,,,,,C++;HTML/CSS;Python,C++;HTML/CSS;JavaScript;SQL,,MySQL,Windows,Windows,Django,Django,,,Atom;PyCharm,Windows,I do not use containers,,Useful across many domains and could change ma...,Yes,Yes,Yes,Instagram,Online,Username,2017,Daily or almost daily,Find answers to specific questions;Learn how t...,3-5 times per week,Stack Overflow was much faster,11-30 minutes,Yes,A few times per month or weekly,"No, I knew that Stack Overflow had a job board...","No, and I don't know what those are","Yes, somewhat",Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",100 to 499 employees,"Designer;Developer, back-end;Developer, front-...",3.0,22,1.0,Slightly satisfied,Slightly satisfied,Not at all confident,Not sure,Not sure,"I’m not actively looking, but I am open to new...",1-2 years ago,Interview with people in peer roles,No,"Languages, frameworks, and other technologies ...",I was preparing for a job search,THB,Thai baht,23000.0,Monthly,8820.0,40.0,There's no schedule or spec; I work on what se...,Distracting work environment;Inadequate access...,Less than once per month / Never,Home,Average,No,,"No, but I think we should",Not sure,I have little or no influence,HTML/CSS,Elixir;HTML/CSS,PostgreSQL,PostgreSQL,,,,Other(s):,,,Vim;Visual Studio Code,Linux-based,I do not use containers,,,Yes,Yes,Yes,Reddit,In real life (in person),Username,2011,A few times per week,Find answers to specific questions;Learn how t...,6-10 times per week,They were about the same,,Yes,Less than once per month or monthly,Yes,"No, I've heard of them, but I am not part of a...",Neutral,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult


How the first row responded to the Hobbyist question.

In [38]:
res_df.loc[0, 'Hobbyist']  # one row label + one column label -> a single value

'Yes'

Note that a single row label and a single column label are passed without list brackets; the result is then a single value rather than a Series.

Get the responses of the first three rows to the Hobbyist question by passing in a slice of rows and the column label.

In [37]:
res_df.loc[0:2, 'Hobbyist']  # row slice written bare (no list brackets) + one column label -> Series

0    Yes
1     No
2    Yes
Name: Hobbyist, dtype: object

When selecting a slice of rows or columns, write the slice without surrounding list brackets: a slice such as `0:2` placed inside a list literal is a syntax error.

Retrieve a slice of rows together with a slice of columns.

In [39]:
res_df.loc[0:2, 'Hobbyist':'Employment']  # column label slices include both end labels as well

Unnamed: 0,Hobbyist,OpenSourcer,OpenSource,Employment
0,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work"
1,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work"
2,Yes,Never,The quality of OSS and closed source software ...,Employed full-time


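Note that label-based slicing with `loc` includes both endpoints, unlike ordinary Python slicing. This means you never have to know which label comes *after* the last one you want.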