In [None]:
# pandas is python library used to access, clean and manipulate data.

<br><br>

## *IMPORTING*

In [1]:
import pandas as pd
import numpy as np

<br><br>

## *CREATING*

In [2]:
# within pandas, we store/handle data in the form of DataFrame

'''
----------------
    DataFrame
----------------

-> DataFrame is 2D labeled data structure with column in which we can store data of any type (string, integer, float etc etc)
'''

# NOTE : Within DataFrame we have rows and columns
# NOTE : Each column in DataFrame is a series and each row in DataFrame is a index

character_df = pd.DataFrame(
    {
        "Name" : [
            "Levi Ackerman",
            "Anne Leonhart",
            "Eren Yeager",
            "Sasha Blause",
            "Riener Braun",
            "Jean Kirstein"
            ],
        "Age" : [32, 23, 19, 20, 23, 20],
        "Sex" : ["Male", "Female", "Male", "Female", "Male", "Male"],
    }
)


# NOTE : While using python dictionary of list, keys of dicitonary will become header of column and list will become that columns data

<br><br>

## *CALLING*

In [3]:
# this is how we call dataframe within ipynb file(without print)
character_df

Unnamed: 0,Name,Age,Sex
0,Levi Ackerman,32,Male
1,Anne Leonhart,23,Female
2,Eren Yeager,19,Male
3,Sasha Blause,20,Female
4,Riener Braun,23,Male
5,Jean Kirstein,20,Male


<br><br>

### *CALLING ONE COLUMN (Series)*

In [4]:
# we can get any column by there header(column name) the same we get data of any key from dicitonary
character_df["Name"]

# NOTE : The above code returns us what we call "Series", with data and their indexes accordingly
# NOTE : We can see the name and datatype(dtype) of pandas column at the bottom of output
# NOTE : Here the series has string data which pandas represents as object.

0    Levi Ackerman
1    Anne Leonhart
2      Eren Yeager
3     Sasha Blause
4     Riener Braun
5    Jean Kirstein
Name: Name, dtype: object

In [5]:
# series are just like dictionaries in python, on accessing element with label via indexing it will give data accordingly
# on not finding label via indexing, it will raise exception
age = character_df["Age"]
age[2]      # 18 
# age[7]    # will raise error


# on not finding label via get() method, it will return None or specified default
age = character_df["Age"]
age.get(2)                      # 18
# age.get(7)                    # None
# age.get(7, "Not defined")     # "Not defined"

19

<br><br>

### *ATTRIBUTES AND METHODS FOR DATAFRAME/SERIES*

In [6]:
# dtype attribute return series containing datatypes of each column in DataFrame/Series
character_df.dtypes

# NOTE : integer -> (int64), float -> (float64), string -> (object)

Name    object
Age      int64
Sex     object
dtype: object

In [7]:
# shape attribute return tuple (axis length of each axis) in DataFrame/Series
character_df.shape

(6, 3)

In [8]:
# array attribute is used to return numpy array version of pandas series
character_df["Age"].array

# NOTE : This attribute is mostly used to perform operation on pandas Series by converting them into numpy array

<PandasArray>
[32, 23, 19, 20, 23, 20]
Length: 6, dtype: int64

In [9]:
# info() will return info of each column in DataFrame
character_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    6 non-null      object
 1   Age     6 non-null      int64 
 2   Sex     6 non-null      object
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes


In [None]:
# head() will return the required top-most() rows of DataFrame (numbers of rows want will be passed as argument)
character_df.head(2)


In [None]:
# tail() will return the required bottom-most() rows of DataFrame (numbers of rows want will be passed as argument)
character_df.tail(2)

In [None]:
# we can several operation on series with its method
character_df["Age"].max()
# part before max() method in code returns series on which we have performed max() method which will get us maximum(largest) value from the series

In [None]:
# describe() method will give us all the statastical operation
character_df["Age"].describe()

# NOTE : describe method only works with numerical data

<br><br>

### *CALLING SPECIFIC COLUMNS (SUB-DATAFRAME)*

In [None]:
# to call specific column we should pass list of column names in square brackets like beloq : 
# this will return an DataFram containing only columns which are specified
character_df[["Name", "Age"]]

In [None]:
# to select specific rows and columns we have to call them with their labels
character_df.loc[[1,2],["Name", "Age"]]

# NOTE : loc operator with [] brackets returns DataFrame with specified rows and column passed as list seperated by comma.

In [None]:
# to select specific rows or columns we have to call them with their indices
character_df.iloc[[1,2],[0,1]]

# NOTE : iloc operator with [] brackets returns DataFrame with specified rows and columns in indices passed as list seperated by comma.

In [None]:
# here we can filter out rows based on the data with the help of comparison operators
# the below code will return Series of True and False based on condition which we stores in variable
greater_than_20 = character_df["Age"] > 20

#we can pass that variable holding Series within brackets like below to actually extract the filtered data we want
character_df[greater_than_20]

### *ADDING/REMOVING/UPDATING COLUMNS*

In [None]:
# just like python's dictionary, through accessing column which isn't there, it willl create new column with performed new values

character_df["Age > 20"] = character_df["Age"] > 20
character_df

In [None]:
# with the help of del operator or pop() we can delete the column from DatFrame
del character_df["Age > 20"]

# character_df

<br><br>

### *SETTING INDICES THROUGH COLUMN*

In [None]:
character_df.set_index("Name", inplace=True)
character_df
# while running this line you will see that nothing changes in DataFrame, to actually make change we have to put "inplace=True" in above line, which by default is False

<br><br>

### *RESET INDICES*

In [139]:
# to undo previous changes in indexes we have to use reset_index() on DataFrame
character_df.reset_index(inplace=True)

<br><br>

### *SORTING ROWS*

In [140]:
character_df.sort_index(ascending=True)
# to permanently change DataFrame in sorted order, we should use "inplace=True"

Unnamed: 0,Name,Age,Sex
0,Levi Ackerman,32,Male
1,Anne Leonhart,23,Female
2,Eren Yeager,19,Male
3,Sasha Blause,20,Female
4,Riener Braun,23,Male
5,Jean Kirstein,20,Male


<br><br>

## FILTERING DATA

In [147]:
# NOTE : filtering data can be done with pandas comparison and logical operator

# suppose we have to work on female characters, we can filter out data with normal comparison operator
filt = character_df["Sex"] == "Female"          # this code will return series of True and False

# now with the help of "loc" we can filter out rows from above code and columns manually (which we want)
character_df.loc[filt, ["Name", "Age"]]

Unnamed: 0,Name,Age
1,Anne Leonhart,23
3,Sasha Blause,20


<br><br>

### WORKING ON DATA FROM STACKOVERFLOW SURVEY 2019

In [10]:
so_df = pd.read_csv("../data/survey_results_public.csv")
so_df

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88878,88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy
88879,88601,,No,Never,The quality of OSS and closed source software ...,,,,,,...,,,,,,,,,,
88880,88802,,No,Never,,Employed full-time,,,,,...,,,,,,,,,,
88881,88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,...,,,,,,,,,,


In [160]:
# now we can filter out data in more meaningfull way as the data is real this time :P
no_opensourcer = (so_df["OpenSourcer"] == "Never")  # here we are filtering only programmers who were "Never" opensourcer
# here we can use loc operator to access only those rows which we have to filter and columns which we want to work on futhur
so_df.loc[no_opensourcer, ["Hobbyist", "OpenSourcer", "Employment", "Country", "Student"]]

Unnamed: 0,Hobbyist,OpenSourcer,Employment,Country,Student
0,Yes,Never,"Not employed, and not looking for work",United Kingdom,No
2,Yes,Never,Employed full-time,Thailand,No
3,No,Never,Employed full-time,United States,No
5,Yes,Never,Employed full-time,Canada,No
6,No,Never,"Independent contractor, freelancer, or self-em...",Ukraine,No
...,...,...,...,...,...
88873,No,Never,,,
88874,No,Never,Employed full-time,,
88879,No,Never,,,
88880,No,Never,Employed full-time,,


<br><br>

### FILTERING MULTIPLE COLUMNS

In [162]:
# we can also use pandas logical operators to perform filtering with multiple conditions on multiple columns
filt = ((so_df["OpenSourcer"] == "Never") & (so_df["Employment"] == "Employed full-time"))
so_df[filt]

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,6,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Canada,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,28.0,Man,No,Straight / Heterosexual,East Asian,No,Too long,Neither easy nor difficult
15,16,I am a developer by profession,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,United Kingdom,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,26.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Neither easy nor difficult
18,19,I am a developer by profession,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Brazil,No,Some college/university study without earning ...,"Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,31.0,Man,No,Straight / Heterosexual,Hispanic or Latino/Latina,Yes,Too long,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88846,83612,,Yes,Never,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Thailand,,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,...,Not applicable - I did not use Stack Overflow ...,,,Woman,Yes,Gay or Lesbian,,,,
88851,83800,,No,Never,"OSS is, on average, of LOWER quality than prop...",Employed full-time,,,,,...,,,,,,,,,,
88853,84299,,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,India,"Yes, full-time","Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Somewhat more welcome now than last year,,,,,,,,,
88874,88076,,No,Never,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,,,,,...,,,,,,,,,,


<br><br>

### FILTERING MULTIPLE VALUES

In [163]:
countries = ["United States", "India", "United Kingdom", "Germany", "Canada"]
filt = so_df["Country"].isin(countries)
so_df.loc[filt, "Country"]

0        United Kingdom
3         United States
5                Canada
7                 India
9                 India
              ...      
88859     United States
88863    United Kingdom
88864             India
88877     United States
88878            Canada
Name: Country, Length: 45008, dtype: object

<br><br>

## UPDATE COLUMN

<br><br>

### CHANGING MULTIPLE COLUMNS NAME

In [300]:
# we can assign specific column a new name by rename() within which column parameter takes dictionary of key-value pair
# key -> value to change
# value -> new value
# to actually change columns name in DataFrame we have to use "inplace=True"


character_df.rename(columns={"Fullname":"Name", "Gender":"Sex"}, inplace=True)
character_df

Unnamed: 0,Name,Age,Sex,PetName,FirstName,LastName
0,LEVI ACKERMAN,32,M,Spinner,LEVI,ACKERMAN
1,Anne Leonhart,23,F,Silent Kid,Anne,Leonhart
2,Eren Yeager,19,M,Pysch,Eren,Yeager
3,SASHA BLAUSE,20,F,Potato Girl,SASHA,BLAUSE
4,RIENER BRAUN,23,M,I wanna die,RIENER,BRAUN
5,JEAN KIRSTEIN,20,M,Horse,JEAN,KIRSTEIN


<br><br>

### CHANGING MULTIPLE VALUES FROM ROWS OR COLUMNS

In [214]:
# here we have changed value in Age column for rows with index 1 and 2 by assigning them with list of new values
# can do same with multiple columns

character_df.loc[[1,2], "Age"] = [22,20]
character_df

Unnamed: 0,Name,Age,Sex
0,Levi Ackerman,32,Male
1,Anne Leonhart,22,Female
2,Eren Yeager,20,Male
3,Sasha Blause,20,Female
4,Riener Braun,23,Male
5,Jean Kirstein,20,Male


<br><br>

### CHANGING MULTIPLE VALUES FROM ROWS AND COLUMNS

In [215]:
# to change data from multiple rows and columns simeoultaneously we have to assign nested list to selected rows and columns of DataFrame

character_df.loc[[1,2],["Name", "Age"]] = [["Anne", 23],["Ereh",19]]
character_df

Unnamed: 0,Name,Age,Sex
0,Levi Ackerman,32,Male
1,Anne,23,Female
2,Ereh,19,Male
3,Sasha Blause,20,Female
4,Riener Braun,23,Male
5,Jean Kirstein,20,Male


<br><br>

### CHANGING MULTIPLE VALUES IN THE FILTERED DATA

In [216]:
filt = (character_df["Sex"] == "Female")
character_df.loc[filt, "Sex"] = "F"
character_df

Unnamed: 0,Name,Age,Sex
0,Levi Ackerman,32,Male
1,Anne,23,F
2,Ereh,19,Male
3,Sasha Blause,20,F
4,Riener Braun,23,Male
5,Jean Kirstein,20,Male


<br><br>

### APPLYING SOME OPERATION ON COLUMN(SERIES) WITH APPLY()

In [217]:
# apply() returns updated DataFrame after applying it on series(selected column from DataFrame)

character_df["Name"] = character_df["Name"].apply(lambda x: x.upper())
character_df

Unnamed: 0,Name,Age,Sex
0,LEVI ACKERMAN,32,Male
1,ANNE,23,F
2,EREH,19,Male
3,SASHA BLAUSE,20,F
4,RIENER BRAUN,23,Male
5,JEAN KIRSTEIN,20,Male


<br><br>

### APPLYING SOME OPERATION ON DATAFRAME WITH APPLY()

In [218]:
# here apply() will perform operation on each series rather than each data in it
character_df.apply(len)

Name    6
Age     6
Sex     6
dtype: int64

<br><br>


### APPLYING SOME OPERATION ONE EACH DATA WITHIN DATAFRAME WITH APPLYMAP()

<br><br>


### CHANGING DATA FOR SERIES WITH REPLACE()

In [220]:
# replace() will replace old values with new one on only specified data
# it takes dictionary with "key" being the old and "value" being the new value

character_df["Name"] = character_df["Name"].replace({"ANNE":"Anne Leonhart", "EREH":"Eren Yeager"})
character_df

Unnamed: 0,Name,Age,Sex
0,LEVI ACKERMAN,32,M
1,Anne Leonhart,23,F
2,Eren Yeager,19,M
3,SASHA BLAUSE,20,F
4,RIENER BRAUN,23,M
5,JEAN KIRSTEIN,20,M


<br><br>

### COMBINING COLUMNS FOR OUTPUT

In [223]:
character_df["Name"] + " -> " + character_df["Sex"]

0    LEVI ACKERMAN -> M
1    Anne Leonhart -> F
2      Eren Yeager -> M
3     SASHA BLAUSE -> F
4     RIENER BRAUN -> M
5    JEAN KIRSTEIN -> M
dtype: object

<br><br>

### ADDING NEW COLUMN

In [301]:
character_df["PetName"] = ["Spinner", "Silent Kid", "Pysch", "Potato Girl", "Suicider", "Horse"]
character_df

Unnamed: 0,Name,Age,Sex,PetName,FirstName,LastName
0,LEVI ACKERMAN,32,M,Spinner,LEVI,ACKERMAN
1,Anne Leonhart,23,F,Silent Kid,Anne,Leonhart
2,Eren Yeager,19,M,Pysch,Eren,Yeager
3,SASHA BLAUSE,20,F,Potato Girl,SASHA,BLAUSE
4,RIENER BRAUN,23,M,Suicider,RIENER,BRAUN
5,JEAN KIRSTEIN,20,M,Horse,JEAN,KIRSTEIN


<br><br>

### DROPING COLUMNS

In [303]:
# drop() will return a view of what would DataFrame look like after droping column, to actually delete it from DataFrame we have to put "inplace=True"
character_df.drop(columns=['PetName'])

Unnamed: 0,Name,Age,Sex,FirstName,LastName
0,LEVI ACKERMAN,32,M,LEVI,ACKERMAN
1,Anne Leonhart,23,F,Anne,Leonhart
2,Eren Yeager,19,M,Eren,Yeager
3,SASHA BLAUSE,20,F,SASHA,BLAUSE
4,RIENER BRAUN,23,M,RIENER,BRAUN
5,JEAN KIRSTEIN,20,M,JEAN,KIRSTEIN


<br><br>

### EXPANDING COLUMNS

In [231]:
character_df[["FirstName", "LastName"]] = character_df["Name"].str.split(' ', expand=True)
character_df

Unnamed: 0,Name,Age,Sex,PetName,FirstName,LastName
0,LEVI ACKERMAN,32,M,Spinner,LEVI,ACKERMAN
1,Anne Leonhart,23,F,Silent Kid,Anne,Leonhart
2,Eren Yeager,19,M,Pysch,Eren,Yeager
3,SASHA BLAUSE,20,F,Potato Girl,SASHA,BLAUSE
4,RIENER BRAUN,23,M,I wanna die,RIENER,BRAUN
5,JEAN KIRSTEIN,20,M,Horse,JEAN,KIRSTEIN


In [305]:
character_df = character_df.drop(columns=["FirstName", "LastName"])

<br><br>

### ADDING ROW(s)

In [306]:
# the below code will append new row, without "ignore_index=True" we can't append it in DataFrame
# there is no "inplace=True" in append method so we manully have to access it to DataFrame to permanently apply it
character_df =  character_df.append({"Name":"Tonry Stark", "Age":55, "Sex":"M"}, ignore_index=True)
character_df

  character_df =  character_df.append({"Name":"Tonry Stark", "Age":55, "Sex":"M"}, ignore_index=True)


Unnamed: 0,Name,Age,Sex,PetName
0,LEVI ACKERMAN,32,M,Spinner
1,Anne Leonhart,23,F,Silent Kid
2,Eren Yeager,19,M,Pysch
3,SASHA BLAUSE,20,F,Potato Girl
4,RIENER BRAUN,23,M,Suicider
5,JEAN KIRSTEIN,20,M,Horse
6,Tonry Stark,55,M,


<br><br>

### DROPING ROW(s)

In [307]:
# drop() will also drop rows with index parameter take index number (list of number if wanna drop multiple rows)
character_df = character_df.drop(index=[6])
character_df

Unnamed: 0,Name,Age,Sex,PetName
0,LEVI ACKERMAN,32,M,Spinner
1,Anne Leonhart,23,F,Silent Kid
2,Eren Yeager,19,M,Pysch
3,SASHA BLAUSE,20,F,Potato Girl
4,RIENER BRAUN,23,M,Suicider
5,JEAN KIRSTEIN,20,M,Horse


<br><br>

### SORTING

In [271]:
# sort_value() will sort DataFrame's value by given parameter in "by"(below code is sorted by FirstName)
# with ascending parameter we can sort it in ascending/descending order(by default it will be in ascending)
character_df.sort_values(by="FirstName", ascending=False)

# NOTE : we access multiple columns passed as list and sort them also in multiple ways also passed as list
# NOTE : sort() will only sort it out for output, to actually change it we have to use "inplace=True"

Unnamed: 0,Name,Age,Sex,PetName,FirstName,LastName
3,SASHA BLAUSE,20,F,Potato Girl,SASHA,BLAUSE
4,RIENER BRAUN,23,M,I wanna die,RIENER,BRAUN
0,LEVI ACKERMAN,32,M,Spinner,LEVI,ACKERMAN
5,JEAN KIRSTEIN,20,M,Horse,JEAN,KIRSTEIN
2,Eren Yeager,19,M,Pysch,Eren,Yeager
1,Anne Leonhart,23,F,Silent Kid,Anne,Leonhart


In [270]:
# to sort DataFrame based on index we have "sort_index()"
character_df.sort_index()

Unnamed: 0,Name,Age,Sex,PetName,FirstName,LastName
0,LEVI ACKERMAN,32,M,Spinner,LEVI,ACKERMAN
1,Anne Leonhart,23,F,Silent Kid,Anne,Leonhart
2,Eren Yeager,19,M,Pysch,Eren,Yeager
3,SASHA BLAUSE,20,F,Potato Girl,SASHA,BLAUSE
4,RIENER BRAUN,23,M,I wanna die,RIENER,BRAUN
5,JEAN KIRSTEIN,20,M,Horse,JEAN,KIRSTEIN
