#### Import Statements

In [1]:
import pandas as pd
import numpy as np

#### Reading the Data Files

----------

In [2]:
schema_df = pd.read_csv(
    "PandasSampleData/survey_results_schema.csv", index_col="Column"
)
na_vals = ["NA", "N/A", "Missing", "MISSING"]
df = pd.read_csv("PandasSampleData/survey_results_public.csv", na_values=na_vals)

In [3]:
dt_df = pd.read_csv("PandasSampleData/ETH_1h.csv")

In [4]:
# We have to convert the Date column to pandas datetime object as Date is stored as string. 
dt_df["Date"] = pd.to_datetime(dt_df["Date"], format="%Y-%m-%d %I-%p")

In [5]:
dt_df.set_index("Date", inplace=True)

-------

In [6]:
schema_df.sort_index(ascending=True, inplace=True)

In [7]:
schema_df.loc["CompTotal", "QuestionText"]

'What is your current total compensation (salary, bonuses, and perks, before taxes and deductions), in `CurrencySymbol`? Please enter a whole number in the box below, without any punctuation. If you are paid hourly, please estimate an equivalent weekly, monthly, or yearly salary. If you prefer not to answer, please leave the box empty.'

In [8]:
df.value_counts("Age") # Pass in 'normalize=True' to see the output in percentage of 1.  

Age
25.0     2693
28.0     2412
30.0     2406
26.0     2391
27.0     2338
         ... 
39.5        1
35.7        1
31.5        1
34.5        1
279.0       1
Length: 110, dtype: int64

In [9]:
# We can also use String methods in pandas by using ".str" class to manipulate strings
countries = ["India", "Bangladesh"]
filt = (
    (df["ConvertedComp"] > 70000)
    & (df["Country"].isin(countries))
    & ((df["LanguageWorkedWith"]).str.contains("Python", na=False))
)
# df.loc[salary_filt, ["Country", "Age", "ConvertedComp", "CurrencyDesc", "UndergradMajor", "EdLevel", "DevType", "LanguageWorkedWith", "YearsCode", "YearsCodePro"]]
df.loc[filt, ["Age", "ConvertedComp"]].sample(3)

Unnamed: 0,Age,ConvertedComp
20047,22.0,117276.0
38128,,155976.0
6297,,275772.0


In [10]:
# Sorting the Stack Overflow data frame to see who 'Earns the most' and if there are people who earns the same...
# ...then sort them by their 'Age'
df.sort_values(by=["ConvertedComp", "Age"], ascending=[False, False])[["Country", "ConvertedComp", "Age", "LanguageWorkedWith"]].head(20)

# Actually theres an easy way to get the same data using '.nlargest' as we are actually sorting 20 rows of data...
# ... by largest ConvertedComp Value
df.nlargest(3, "ConvertedComp")
# we can also use 'nsmallest' to sort by Smallest Values

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
121,123,I am a developer by profession,Yes,26.0,12,Weekly,120000.0,2000000.0,United States,United States dollar,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Flask;jQuery;React.js,Spring,Just as welcome now as I felt last year,36.0,8,3
123,125,"I am not primarily a developer, but I write co...",Yes,41.0,30,Monthly,200000.0,2000000.0,United States,United States dollar,...,Easy,Appropriate in length,No,,,,Just as welcome now as I felt last year,40.0,11,11
191,193,I am a developer by profession,Yes,29.0,16,Weekly,120000.0,2000000.0,United States,United States dollar,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,40.0,13,7


In [11]:
""" Know things about your data : """
# .median() gives us the median while .describe() describes the data by returning various statistical measures
df["Age"].median()
df.describe()

Unnamed: 0,Respondent,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,64461.0,45446.0,34826.0,34756.0,41151.0
mean,32554.079738,30.834111,3.190464e+242,103756.1,40.782174
std,18967.44236,9.585392,inf,226885.3,17.816383
min,1.0,1.0,0.0,0.0,1.0
25%,16116.0,24.0,20000.0,24648.0,40.0
50%,32231.0,29.0,63000.0,54049.0,40.0
75%,49142.0,35.0,125000.0,95000.0,44.0
max,65639.0,279.0,1.1111110000000001e+247,2000000.0,475.0


In [12]:
# grouping the data by country to see the value counts of age of those countries
country_grp = df.groupby(["Country"])

In [13]:
country_grp["ConvertedComp"].median().loc["Germany"]  # Cause Country names are the new indexes now.

62697.0

In [14]:
# to apply multiple aggregate functions : 
country_grp["ConvertedComp"].aggregate(["median", "mean", "std", "var"])

Unnamed: 0_level_0,median,mean,std,var
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,15163.5,148290.125000,332705.942277,1.106932e+11
Albania,15900.0,25611.000000,40946.849387,1.676644e+09
Algeria,9432.0,10362.812500,8802.055769,7.747619e+07
Andorra,88640.0,88640.000000,74908.063972,5.611218e+09
Angola,5292.0,5292.000000,627.910822,3.942720e+05
...,...,...,...,...
"Venezuela, Bolivarian Republic of...",3600.0,6280.611111,6714.752459,4.508790e+07
Viet Nam,10344.0,28342.605769,99697.518064,9.939595e+09
Yemen,36000.0,36000.000000,,
Zambia,5452.0,17506.400000,24930.397125,6.215247e+08


In [15]:
# We use 'apply' instead of using '.str' directly as 'SeriesGroupBy' object has no attribute 'str'
knows_py = country_grp["LanguageWorkedWith"].apply(lambda x: x.str.contains("Python", na=False).value_counts(normalize=True))

countries = list(df.Country)
truth = [True for x in range(len(countries))]

index_to_look = list(zip(countries, truth))

knows_py.reindex(index_to_look)

Country                 
Germany             True    0.440103
United Kingdom      True    0.416068
Russian Federation  True    0.385531
Albania             True    0.240741
United States       True    0.478306
                              ...   
                    True    0.478306
Morocco             True    0.380682
Viet Nam            True    0.280220
Poland              True    0.398279
Spain               True    0.385874
Name: LanguageWorkedWith, Length: 64461, dtype: float64

In [16]:
num_coders_knows_py = pd.DataFrame(country_grp["LanguageWorkedWith"].apply(lambda x: x.str.contains("Python", na=False).sum()))

num_coders = pd.DataFrame(df["Country"].value_counts())

country_coders = pd.concat([num_coders_knows_py, num_coders], axis=1, sort=True)

country_coders.rename(columns={"LanguageWorkedWith": "Num_Python_Coders", "Country": "Num_Coders"}, inplace=True)

country_coders["Py_Coders_Percentage"] = (country_coders["Num_Python_Coders"] / country_coders["Num_Coders"]) * 100

In [17]:
country_coders

Unnamed: 0,Num_Python_Coders,Num_Coders,Py_Coders_Percentage
Afghanistan,11,84,13.095238
Albania,13,54,24.074074
Algeria,40,94,42.553191
Andorra,3,13,23.076923
Angola,1,9,11.111111
...,...,...,...
"Venezuela, Bolivarian Republic of...",29,70,41.428571
Viet Nam,102,364,28.021978
Yemen,1,7,14.285714
Zambia,4,21,19.047619


In [18]:
df["YearsCode"] = df["YearsCode"].replace({"Less than 1 year": 0, "More than 50 years":51})
df["YearsCode"] = df["YearsCode"].astype(float)
df["YearsCode"].mean()

12.709052770265584

In [19]:
dt_df.resample("W").agg({"High": "max", "Low": "min", "Volume": "sum"})

Unnamed: 0_level_0,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-07-02,293.73,253.23,8.084631e+07
2017-07-09,285.00,231.25,2.246746e+08
2017-07-16,240.33,130.26,5.017750e+08
2017-07-23,249.40,153.25,7.221637e+08
2017-07-30,229.99,178.03,2.657305e+08
...,...,...,...
2020-02-16,290.00,216.31,3.912867e+08
2020-02-23,287.13,242.36,3.067838e+08
2020-03-01,278.13,209.26,3.693920e+08
2020-03-08,253.01,196.00,2.736569e+08
