## String Manipulation

In [68]:
import pandas as pd
import numpy as np
import re
import random

In [69]:
# Sample comma-separated string; the last field carries a leading space.
val = 'a,b, guido'
val

'a,b, guido'

In [70]:
# Split on commas; whitespace around the pieces is kept as-is.
val.split(sep=",")

['a', 'b', ' guido']

In [71]:
# Split on commas, then trim the whitespace around every piece.
sub = list(map(str.strip, val.split(",")))
sub

['a', 'b', 'guido']

In [72]:
# Unpack the three cleaned fields into separate names.
a, b, c = sub

In [73]:
# Glue the three fields together with a "::" delimiter.
f"{a}::{b}::{c}"

'a::b::guido'

In [74]:
"::".join(sub)  # Pythonic way: join the pieces instead of chaining +

'a::b::guido'

In [75]:
# Detect a substring: `in` applied to a str tests substring containment.
# (The previous `a in sub` tested list membership of an element that was
# itself unpacked from `sub`, so it was always True and never inspected a
# string.)
"guido" in val

True

In [76]:
# index() returns the position of the first occurrence of the substring.
val.index(",")

1

In [79]:
# index() raises ValueError when the substring is absent. Catch and show the
# error so the demonstration does not abort a Restart & Run All.
try:
    val.index(":")
except ValueError as exc:
    print(f"ValueError: {exc}")

ValueError: substring not found

In [78]:
# find() also returns the first index of the substring, but yields -1
# instead of raising when the substring is absent.
val.find(",")

1

In [80]:
# Number of non-overlapping occurrences of the substring.
val.count(",")

2

In [82]:
# Return a copy with every occurrence of the first string replaced by the second.
val.replace(",",":::")

'a:::b::: guido'

In [83]:
## re - Regular expressions: pattern matching, substitution, and splitting

import re
text = "foo bar\t baz \tqux"
# Split on runs of one or more whitespace characters (spaces and tabs here).
re.split(pattern=r"\s+", string=text)

['foo', 'bar', 'baz', 'qux']

In [84]:
# Compile the pattern once so it can be reused. The pattern must be a raw
# string: "\s" inside a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning from Python 3.12 on).
regex = re.compile(r"\s+")
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [85]:
regex.findall(text)  # every substring matched by the pattern (here: each whitespace run)

[' ', '\t ', ' \t']

In [86]:
# Sample text: one "name email" record per line.
text = "\n".join([
    "Dave dave@google.com",
    "Steve steve@gmail.com",
    "Rob rob@gmail.com",
    "Ryan ryan@yahoo.com",
])

In [87]:
# Email-like pattern: local part, "@", domain, ".", then a 2-4 letter suffix.
# The character classes list only A-Z, so matching lowercase text relies on
# compiling with re.IGNORECASE.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [89]:
# Compile case-insensitively, since the pattern itself only lists A-Z.
regex = re.compile(pattern, re.IGNORECASE)

In [90]:
# Collect every email address found in the text.
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [91]:
# search() scans the whole string and returns only the first match.
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [93]:
# span() gives the (start, end) offsets of the first match; slicing the text
# with them recovers the matched substring.
text[slice(*m.span())]

'dave@google.com'

In [94]:
# match() only succeeds if the pattern matches at the very start of the
# string; the text starts with a name, not an address, so this prints None.
print(regex.match(text))

None


In [96]:
# Replace every match of the pattern with the given string.
print(regex.sub("MATCHED12", text))

Dave MATCHED12
Steve MATCHED12
Rob MATCHED12
Ryan MATCHED12


In [97]:
# Same email pattern, now with parentheses capturing the user name, the
# domain, and the suffix as three separate groups.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
regex = re.compile(pattern, re.IGNORECASE)

In [98]:
# match() anchors at the start of the string; this address matches in full.
m = regex.match("wesm@bright.net")
m

<re.Match object; span=(0, 15), match='wesm@bright.net'>

In [100]:
# groups() returns the captured pieces of the match as a tuple.
m.groups()

('wesm', 'bright', 'net')

In [101]:
# With capture groups in the pattern, findall() returns one tuple of groups
# per match instead of the whole matched string.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [103]:
# \1, \2 and \3 in the replacement refer to the first, second and third
# captured group of each match.
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


### String functions in pandas

In [107]:
# Email addresses keyed by name; np.nan marks a missing value.
# (A pasted IPython continuation prompt " .....: " used to sit inside this
# literal. IPython strips such prompts, but it is a SyntaxError in plain
# Python and breaks script export and linting.)
data = {
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
}
data

{'Dave': 'dave@google.com',
 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com',
 'Wes': nan}

In [108]:
# Convert the dict to a Series; the dict keys become the index.
# Note: `data` is rebound here from a dict to a Series.
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [109]:
# Boolean mask marking the missing entries.
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [110]:
# str.contains: element-wise substring test. The missing value stays NaN,
# which is why the result has dtype object rather than bool.

data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [111]:
# Element-wise findall: each entry becomes a list of group tuples; the
# missing value stays NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [112]:
# .str[0] picks the first element of each list, i.e. the first match's groups.
data.str.findall(pattern, flags=re.IGNORECASE).str[0]

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [114]:
data.str.extract(pattern, flags=re.IGNORECASE)  # returns a DataFrame with one column per capture group

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


In [115]:
## Categorical Data 

In [116]:
# Integer codes (0 or 1) that will be used as positions into `dim`.
values = pd.Series([0, 1, 0, 0, 0, 1, 0, 0])
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [117]:
# The distinct labels; position 0 is "apple", position 1 is "orange".
dim = pd.Series(["apple", "orange"])
dim

0     apple
1    orange
dtype: object

In [118]:
# Use the integer codes in `values` as positions into `dim` to recover the
# label for every row.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [119]:
# Eight fruit labels (the four-item pattern repeated twice) and their count.
fruits = 2 * ["apple", "orange", "apple", "apple"]
N = len(fruits)

In [120]:
# Toy basket table. "count" is a random permutation of 0..N-1 and "weight"
# is uniform on [0, 4). Neither generator is seeded here, so these two
# columns differ on every run.
df = pd.DataFrame(
    {
        "basket_id": range(N),
        "fruit": fruits,
        "count": random.sample(range(N), N),
        "weight": np.random.uniform(0, 4, size=N),
    }
)
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,5,1.56545
1,1,orange,1,3.302753
2,2,apple,6,0.327617
3,3,apple,4,3.806952
4,4,apple,0,2.526736
5,5,orange,3,1.044284
6,6,apple,2,2.638088
7,7,apple,7,2.327018


In [121]:
# Convert the string column to the categorical dtype.
fruit_cat = df["fruit"].astype("category")
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [122]:
# .array returns the values backing the Series; for this categorical Series
# it displays as a Categorical with two categories.
c = fruit_cat.array
c

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [123]:
# Integer code of each value, i.e. its position in c.categories.
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [124]:
# The distinct labels that the codes refer to.
c.categories

Index(['apple', 'orange'], dtype='object')

## Computations with Categories

In [125]:
# Seeded generator so the 1000 standard-normal draws are reproducible.
rng = np.random.default_rng(seed=12345)
draws = rng.standard_normal(size=1000)

In [126]:
# Peek at the first five draws rather than displaying all of them.
draws[:5]

array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331])

In [127]:
# Cut the draws into 4 equal-width intervals spanning the observed range.
# (pd.cut bins by value range; pd.qcut is the quantile-based variant.)
bins = pd.cut(draws, bins=4)

In [128]:
# Display the resulting Categorical of intervals.
bins

[(-1.537, 0.0459], (0.0459, 1.629], (-1.537, 0.0459], (-1.537, 0.0459], (-1.537, 0.0459], ..., (0.0459, 1.629], (0.0459, 1.629], (-1.537, 0.0459], (0.0459, 1.629], (-1.537, 0.0459]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.126, -1.537] < (-1.537, 0.0459] < (0.0459, 1.629] < (1.629, 3.211]]

In [129]:
# Same 4 equal-width bins, with short labels instead of interval objects.
# NOTE(review): despite the Q1..Q4 labels these are NOT quartiles. pd.cut
# makes equal-width bins, so the groups are very uneven in size (the summary
# table further down shows counts of 65/443/438/54). Use pd.qcut(draws, 4, ...)
# if equal-sized quantile bins are intended.
bins = pd.cut(draws, bins=4, labels=["Q1", "Q2", "Q3", "Q4"])
bins

['Q2', 'Q3', 'Q2', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
Length: 1000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [130]:
# Integer code of each draw's bin (its position in bins.categories).
# NOTE(review): this displays all 1000 codes; a slice such as
# bins.codes[:10] would keep the output readable.
bins.codes

array([1, 2, 1, 1, 1, 1, 1, 2, 2, 0, 3, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 1,
       2, 0, 1, 2, 1, 1, 3, 3, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 1, 1, 2, 1, 2, 0, 1,
       2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 0, 2, 1, 1, 1, 2,
       2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 0, 1, 2, 1, 0, 1, 2, 2, 1,
       3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 1, 1,
       2, 1, 1, 0, 2, 1, 2, 2, 1, 2, 0, 1, 2, 2, 2, 1, 1, 1, 2, 3, 0, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 1, 3, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2,
       1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 3, 1, 3, 1, 1, 2, 1, 3, 2, 2,
       2, 3, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 0, 2, 3, 1, 1, 0, 1, 2,
       1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 1, 1, 0, 1, 3, 2,
       2, 1, 1, 1, 2, 1, 1, 2, 0, 1, 1, 1, 1, 1, 2,

In [131]:
# The labels that the codes refer to.
bins.categories

Index(['Q1', 'Q2', 'Q3', 'Q4'], dtype='object')

In [132]:
# Wrap the categorical in a named Series; the name becomes the key column
# after the groupby/reset_index below. Note: `bins` is rebound here.
bins = pd.Series(bins, name='quartiles')
bins

0      Q2
1      Q3
2      Q2
3      Q2
4      Q2
       ..
995    Q3
996    Q3
997    Q2
998    Q3
999    Q2
Name: quartiles, Length: 1000, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [133]:
# Count, minimum and maximum of the draws within each bin.
# observed=False is passed explicitly: grouping by a categorical without it
# emits a FutureWarning on pandas >= 2.1, and False retains the current
# behaviour (every category is kept in the result).
result = (
    pd.Series(draws)
    .groupby(bins, observed=False)
    .agg(["count", "min", "max"])
    .reset_index()
)

In [134]:
# One row per bin, with the bin label as a regular column.
result

Unnamed: 0,quartiles,count,min,max
0,Q1,65,-3.119609,-1.538154
1,Q2,443,-1.523449,0.045684
2,Q3,438,0.048765,1.621952
3,Q4,54,1.651471,3.211418


### Categorical methods

In [135]:
# Eight single-letter strings: "a".."d" repeated twice.
s = pd.Series(list("abcd") * 2)
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: object

In [136]:
# Convert the string Series to the categorical dtype.
cat_s = s.astype('category')
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [137]:
# Per-row integer codes, reached through the .cat accessor of a categorical Series.
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [140]:
# The distinct category labels.
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [141]:
# Number of rows in each category.
cat_s.value_counts()

a    2
b    2
c    2
d    2
Name: count, dtype: int64

In [142]:
# Create dummy (one-hot) variables for modelling. dtype=int yields 0/1
# integers directly, replacing the element-wise .applymap(int) pass
# (DataFrame.applymap is deprecated since pandas 2.1).
pd.get_dummies(cat_s, prefix="cat_s", dtype=int)

Unnamed: 0,cat_s_a,cat_s_b,cat_s_c,cat_s_d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1
