### Data Wrangling with Python (Packt Publishing, 2019)

In [1]:
# A real list: all 1000 element references are materialised in memory up front.
big_list_of_numbers = [1 for _ in range(1000)]

In [2]:
from sys import getsizeof

In [3]:
getsizeof(big_list_of_numbers)

9024

In [4]:
from itertools import repeat

# repeat() hands out its value lazily, one item per request, instead of
# storing 1000 references the way a list does.
small_list_of_numbers = repeat(1, 1000)

# Last expression of the cell: size in bytes of the iterator object itself.
getsizeof(small_list_of_numbers)

56

In [5]:
# The above shows that iterators are handy: they generate values only when needed and therefore use far less memory than a fully built list

In [6]:
# Stack in python

In [7]:
# A plain list works as a stack (LIFO): push with append(), remove with pop().
stack = []

stack.append(25)

In [8]:
stack

[25]

In [9]:
stack.append(-32)

In [10]:
stack

[25, -32]

In [11]:
popped = stack.pop()

In [12]:
popped

-32

In [13]:
stack

[25]

In [14]:
# Lambda function to sort a list of tuples

In [15]:
capitals = [("USA", "Washington"), ("India", "Delhi"), ("France", "Paris"), ("UK", "London")]

capitals

[('USA', 'Washington'),
 ('India', 'Delhi'),
 ('France', 'Paris'),
 ('UK', 'London')]

In [16]:
# let's sort the list by the capitals of each country

In [17]:
# In-place sort keyed on the second element of each tuple, i.e. the capital city.
capitals.sort(key=lambda country_capital: country_capital[1])

In [18]:
capitals

[('India', 'Delhi'),
 ('UK', 'London'),
 ('France', 'Paris'),
 ('USA', 'Washington')]

In [19]:
# finding out if all the words in a list are part of another list

In [20]:
check_for = ["How", "are"]

In [21]:
list_of_words = ["Hello", "there.", "How", "are", "you", "doing?"]

In [22]:
all(w in list_of_words for w in check_for)

True

In [24]:
all(w in list_of_words for w in ["How", "are"])

True

In [25]:
all(w in list_of_words for w in ["How", "bye"])

False

In [26]:
# implementing queue

In [27]:
# A plain list used as a FIFO queue: new items are appended at the tail.
queue = []

queue.append(25)
queue.append(-32)

# Last expression of the cell: show the queue contents.
queue

[25, -32]

In [28]:
popped = queue.pop(0)

In [30]:
popped

25

In [29]:
queue

[-32]

In [31]:
# the above traditional approach of removing the first element with pop(0) is slow for long lists: every remaining element has to be shifted one position to the left (O(n) per pop)

In [32]:
# let's implement queue using deque

In [33]:
from collections import deque

In [34]:
# Start again with an empty deque, which supports popleft() for FIFO use.
queue = deque()

In [35]:
queue

deque([])

In [37]:
queue.append(25)

In [38]:
queue.append(-32)

In [39]:
queue

deque([25, -32])

In [40]:
popped = queue.popleft()

In [41]:
popped

25

In [42]:
queue

deque([-32])

In [43]:
# Data Wrangling

In [44]:
import numpy as np

In [45]:
import pandas as pd

In [46]:
# Load the Superstore workbook; no sheet_name is given, so pandas reads its default (first) sheet.
df = pd.read_excel('Sample - Superstore.xls')

In [47]:
df.size

209874

In [49]:
len(df)

9994

In [51]:
df.sample(2)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
613,614,CA-2016-121223,2016-09-11,2016-09-13,Second Class,GD-14590,Giulietta Dortch,Corporate,United States,Philadelphia,...,19143,East,TEC-PH-10004667,Technology,Phones,Cisco 8x8 Inc. 6753i IP Business Phone System,728.946,9,0.4,-157.9383
3686,3687,CA-2016-102792,2016-12-13,2016-12-19,Standard Class,JC-15340,Jasper Cacioppo,Consumer,United States,Riverside,...,92503,West,OFF-AR-10004757,Office Supplies,Art,Crayola Colored Pencils,9.84,3,0.0,3.2472


In [85]:
df.sample(frac=0.0002) # frac is the fraction (not the %) of rows to return: 0.0002 of 9994 rows rounds to 2 rows

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
2642,CA-2016-124051,2016-12-29,2016-12-30,First Class,KA-16525,Kelly Andreada,Consumer,United States,Aurora,Illinois,60505,Central,OFF-PA-10001289,Office Supplies,Paper,White Computer Printout Paper by Universal,186.048,6,0.2,67.4424
7042,US-2017-165358,2017-07-18,2017-07-23,Standard Class,SV-20365,Seth Vernon,Consumer,United States,Philadelphia,Pennsylvania,19134,East,TEC-CO-10001943,Technology,Copiers,Canon PC-428 Personal Copier,599.97,5,0.4,69.9965


In [53]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Row ID,9994.0,4997.5,2885.163629,1.0,2499.25,4997.5,7495.75,9994.0
Postal Code,9994.0,55190.379428,32063.69335,1040.0,23223.0,56430.5,90008.0,99301.0
Sales,9994.0,229.858001,623.245101,0.444,17.28,54.49,209.94,22638.48
Quantity,9994.0,3.789574,2.22511,1.0,2.0,3.0,5.0,14.0
Discount,9994.0,0.156203,0.206452,0.0,0.0,0.2,0.2,0.8
Profit,9994.0,28.656896,234.260108,-6599.978,1.72875,8.6665,29.364,8399.976


In [55]:
df.isnull().sum().sum()

0

In [56]:
# Drop the 'Row ID' column. Rebinding `df` (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
df = df.drop(columns='Row ID', errors='ignore')

In [57]:
df.sample(3)

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9716,US-2016-144477,2016-08-12,2016-08-14,Second Class,DB-13270,Deborah Brumfield,Home Office,United States,Philadelphia,Pennsylvania,19134,East,OFF-AR-10000914,Office Supplies,Art,Boston 16765 Mini Stand Up Battery Pencil Shar...,37.312,4,0.2,2.7984
4327,CA-2016-153346,2016-01-25,2016-01-27,First Class,TB-21355,Todd Boyes,Corporate,United States,Plainfield,New Jersey,7060,East,OFF-PA-10000007,Office Supplies,Paper,Telephone Message Books with Fax/Mobile Sectio...,18.0,5,0.0,8.28
1222,CA-2016-126004,2016-12-04,2016-12-05,First Class,BM-11140,Becky Martin,Consumer,United States,New York City,New York,10024,East,FUR-FU-10001602,Furniture,Furnishings,"Eldon Delta Triangular Chair Mat, 52"" x 58"", C...",113.79,3,0.0,20.4822


In [58]:
# Label-based selection: rows labelled 5 through 9, restricted to five columns.
df_subset = df.loc[
    list(range(5, 10)),
    ['Customer ID', 'Customer Name', 'City', 'Postal Code', 'Sales'],
]

In [59]:
df_subset # only rows from 5-9 are displayed

Unnamed: 0,Customer ID,Customer Name,City,Postal Code,Sales
5,BH-11710,Brosina Hoffman,Los Angeles,90032,48.86
6,BH-11710,Brosina Hoffman,Los Angeles,90032,7.28
7,BH-11710,Brosina Hoffman,Los Angeles,90032,907.152
8,BH-11710,Brosina Hoffman,Los Angeles,90032,18.504
9,BH-11710,Brosina Hoffman,Los Angeles,90032,114.9


In [60]:
df['State'].nunique() # number of unique items in the State column

49

In [63]:
# Keep the rows whose City is 'Los Angeles' and whose Sales value is above 100.
df_subset.loc[df_subset['City'].eq('Los Angeles') & df_subset['Sales'].gt(100)]

Unnamed: 0,Customer ID,Customer Name,City,Postal Code,Sales
7,BH-11710,Brosina Hoffman,Los Angeles,90032,907.152
9,BH-11710,Brosina Hoffman,Los Angeles,90032,114.9


In [64]:
# Rebuild df_subset from index labels 0 through 9 with three columns.
df_subset = df.loc[list(range(10)), ['Ship Mode', 'State', 'Sales']]

In [65]:
df_subset

Unnamed: 0,Ship Mode,State,Sales
0,Second Class,Kentucky,261.96
1,Second Class,Kentucky,731.94
2,Second Class,California,14.62
3,Standard Class,Florida,957.5775
4,Standard Class,Florida,22.368
5,Standard Class,California,48.86
6,Standard Class,California,7.28
7,Standard Class,California,907.152
8,Standard Class,California,18.504
9,Standard Class,California,114.9


In [68]:
df_subset.groupby('State').describe().loc['California'] # Single brackets around 'California'

Sales  count      6.000000
       mean     185.219333
       std      355.889307
       min        7.280000
       25%       15.591000
       50%       33.682000
       75%       98.390000
       max      907.152000
Name: California, dtype: float64

In [71]:
df_subset.groupby('State').describe().loc[['California']] # double brackets around 'California'

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
California,6.0,185.219333,355.889307,7.28,15.591,33.682,98.39,907.152


In [69]:
# Same California statistics as the single-bracket lookup, converted from a Series into a one-column DataFrame.
df_subset.groupby('State').describe().loc['California'].to_frame()

Unnamed: 0,Unnamed: 1,California
Sales,count,6.0
Sales,mean,185.219333
Sales,std,355.889307
Sales,min,7.28
Sales,25%,15.591
Sales,50%,33.682
Sales,75%,98.39
Sales,max,907.152


In [70]:
df_subset.groupby('Ship Mode').describe().loc[['Second Class','Standard Class']]

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Ship Mode,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Second Class,3.0,336.173333,364.373037,14.62,138.29,261.96,496.95,731.94
Standard Class,7.0,296.663071,435.947552,7.28,20.436,48.86,511.026,957.5775


In [72]:
# Select 'Sales' before describing so that summary statistics are computed for
# that one column only, instead of for every numeric column and then discarded.
df.groupby(['State', 'City'])['Sales'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Auburn,6.0,294.471667,361.914543,3.760,8.8050,182.030,456.4075,900.080
Alabama,Decatur,13.0,259.601538,385.660903,14.940,23.9200,44.950,239.9200,1215.920
Alabama,Florence,5.0,399.470000,796.488863,4.980,7.2700,12.480,152.7600,1819.860
Alabama,Hoover,4.0,131.462500,230.646923,7.160,13.3925,20.725,138.7950,477.240
Alabama,Huntsville,10.0,248.437000,419.576667,3.620,26.8700,81.920,171.8075,1319.960
...,...,...,...,...,...,...,...,...,...
Wisconsin,Superior,9.0,144.414444,213.394065,5.560,17.1200,47.400,125.9900,629.100
Wisconsin,Waukesha,1.0,54.500000,,54.500,54.5000,54.500,54.5000,54.500
Wisconsin,Wausau,4.0,79.370000,111.450605,12.390,20.0325,29.605,88.9425,245.880
Wisconsin,West Allis,2.0,125.240000,165.067007,8.520,66.8800,125.240,183.6000,241.960


In [73]:
# Missing values

In [74]:
# Read the worksheet named "Missing" from the Superstore workbook.
df_missing = pd.read_excel("Sample - Superstore.xls", sheet_name="Missing")

In [75]:
df_missing

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit
0,Brosina Hoffman,,1706.184,9.0,0.2,85.3092
1,Brosina Hoffman,Phones,911.424,4.0,0.2,68.3568
2,Zuschuss Donatelli,Art,8.56,2.0,0.0,2.4824
3,Zuschuss Donatelli,Phones,,3.0,0.2,16.011
4,Zuschuss Donatelli,Binders,22.72,4.0,0.2,7.384
5,Eric Hoffmann,Binders,11.648,,0.2,4.2224
6,Eric Hoffmann,Accessories,90.57,3.0,0.0,11.7741
7,Ruben Ausman,,77.88,2.0,0.0,
8,,Accessories,13.98,2.0,0.0,6.1512
9,Kunst Miller,Binders,25.824,6.0,0.2,9.3612


In [77]:
df_missing[['Sales']].fillna(df_missing['Sales'].mean()) # NaN in 'Sales' is filled with the column mean (the 301.552 shown in row 3); the result is a new frame and is not stored, so df_missing itself is unchanged

Unnamed: 0,Sales
0,1706.184
1,911.424
2,8.56
3,301.552
4,22.72
5,11.648
6,90.57
7,77.88
8,13.98
9,25.824


In [86]:
df_missing

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit
0,Brosina Hoffman,,1706.184,9.0,0.2,85.3092
1,Brosina Hoffman,Phones,911.424,4.0,0.2,68.3568
2,Zuschuss Donatelli,Art,8.56,2.0,0.0,2.4824
3,Zuschuss Donatelli,Phones,,3.0,0.2,16.011
4,Zuschuss Donatelli,Binders,22.72,4.0,0.2,7.384
5,Eric Hoffmann,Binders,11.648,,0.2,4.2224
6,Eric Hoffmann,Accessories,90.57,3.0,0.0,11.7741
7,Ruben Ausman,,77.88,2.0,0.0,
8,,Accessories,13.98,2.0,0.0,6.1512
9,Kunst Miller,Binders,25.824,6.0,0.2,9.3612


In [87]:
# Series.apply with a lambda: each 'Profit' value is doubled and the result is stored in a new column.
df_missing['Double Profit'] = df_missing['Profit'].apply(lambda profit: profit * 2)

In [88]:
df_missing

Unnamed: 0,Customer,Product,Sales,Quantity,Discount,Profit,Double Profit
0,Brosina Hoffman,,1706.184,9.0,0.2,85.3092,170.6184
1,Brosina Hoffman,Phones,911.424,4.0,0.2,68.3568,136.7136
2,Zuschuss Donatelli,Art,8.56,2.0,0.0,2.4824,4.9648
3,Zuschuss Donatelli,Phones,,3.0,0.2,16.011,32.022
4,Zuschuss Donatelli,Binders,22.72,4.0,0.2,7.384,14.768
5,Eric Hoffmann,Binders,11.648,,0.2,4.2224,8.4448
6,Eric Hoffmann,Accessories,90.57,3.0,0.0,11.7741,23.5482
7,Ruben Ausman,,77.88,2.0,0.0,,
8,,Accessories,13.98,2.0,0.0,6.1512,12.3024
9,Kunst Miller,Binders,25.824,6.0,0.2,9.3612,18.7224


In [89]:
!pip install scipy python-Levenshtein

Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.0.tar.gz (48 kB)
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py): started
  Building wheel for python-Levenshtein (setup.py): finished with status 'done'
  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.0-cp37-cp37m-win_amd64.whl size=82193 sha256=e07a44d211986ddf44090c678db47952f00c4b62304220774ce5e415dada95f8
  Stored in directory: c:\users\public.desktop-6rbqt7l\appdata\local\pip\cache\wheels\f0\9b\13\49c281164c37be18343230d3cd0fca29efb23a493351db0009
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.0


In [90]:
# zip functions

In [91]:
# Two lists of different lengths: four countries but only two capitals.
country = ["india", "denmark", "canada", "usa"]
capital = ["delhi", "copenhagen"]

In [92]:
dict(zip(country,capital)) # zip ends as soon as the shortest of the lists ends

{'india': 'delhi', 'denmark': 'copenhagen'}

In [93]:
from itertools import zip_longest

In [94]:
dict(zip_longest(country,capital)) # zip_longest continues until the longest input is exhausted, filling the missing values with None

{'india': 'delhi', 'denmark': 'copenhagen', 'canada': None, 'usa': None}