### Textual Categorical-Features

#### Ordinal Features

In [1]:
import pandas as pd

In [2]:
# Satisfaction levels from lowest to highest; the position in this list is the ordinal rank.
ordered_satisfaction = [
    'Very Unhappy',
    'Unhappy',
    'Neutral',
    'Happy',
    'Very Happy',
]

In [3]:
# Sample survey responses; note that 'Mad' is not one of the levels in ordered_satisfaction.
satisfaction_responses = ['Mad', 'Happy', 'Unhappy', 'Neutral']
df = pd.DataFrame({'satisfaction': satisfaction_responses})

In [4]:
df

Unnamed: 0,satisfaction
0,Mad
1,Happy
2,Unhappy
3,Neutral


In [5]:
# Encode each response as its position in ordered_satisfaction (0 = 'Very Unhappy').
# The `categories=` / `ordered=` keywords of astype("category", ...) were removed from
# pandas, so the ordering is declared through a CategoricalDtype instead.
# Values that are not in the category list become missing and receive the code -1.
satisfaction_dtype = pd.CategoricalDtype(categories=ordered_satisfaction, ordered=True)
df["satisfaction"] = df["satisfaction"].astype(satisfaction_dtype).cat.codes

In [6]:
df.satisfaction

0   -1
1    3
2    1
3    2
Name: satisfaction, dtype: int8

#### "Mad" is encoded as -1 because it is not one of the categories listed in `ordered_satisfaction`; pandas assigns the code -1 to any value outside the declared categories

In [7]:
df

Unnamed: 0,satisfaction
0,-1
1,3
2,1
3,2


#### Nominal Features

In [8]:
# Nominal (unordered) feature: vertebrate class labels, some of them repeated.
vertebrate_labels = ['Bird', 'Bird', 'Mammal', 'Fish', 'Amphibian', 'Reptile', 'Mammal']
df = pd.DataFrame({'vertebrates': vertebrate_labels})

In [9]:
df

Unnamed: 0,vertebrates
0,Bird
1,Bird
2,Mammal
3,Fish
4,Amphibian
5,Reptile
6,Mammal


Method 1: integer category codes (`.cat.codes`)

In [10]:
# Convert the labels to a categorical column, then replace them with their integer codes.
# Codes follow the sorted category order (Amphibian=0, Bird=1, Fish=2, Mammal=3, Reptile=4).
vertebrates_categorical = df["vertebrates"].astype("category")
df["vertebrates"] = vertebrates_categorical.cat.codes

In [11]:
df

Unnamed: 0,vertebrates
0,1
1,1
2,3
3,2
4,0
5,4
6,3


Method 2: one-hot encoding (`pd.get_dummies`)

In [12]:
# Rebuild the original text column: the integer-code cell above overwrote df["vertebrates"].
vertebrate_labels = ['Bird', 'Bird', 'Mammal', 'Fish', 'Amphibian', 'Reptile', 'Mammal']
df = pd.DataFrame({'vertebrates': vertebrate_labels})

In [13]:
df

Unnamed: 0,vertebrates
0,Bird
1,Bird
2,Mammal
3,Fish
4,Amphibian
5,Reptile
6,Mammal


In [15]:
# One-hot encode: one indicator column per distinct value, named vertebrates_<value>.
# dtype=float pins the indicators to 0.0/1.0; without it the result dtype depends on the
# installed pandas version (boolean columns on pandas 2.x).
df = pd.get_dummies(df, columns=["vertebrates"], dtype=float)

In [16]:
df

Unnamed: 0,vertebrates_Amphibian,vertebrates_Bird,vertebrates_Fish,vertebrates_Mammal,vertebrates_Reptile
0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,1.0,0.0


### Pure Textual Features

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
# Two short documents that share most of their vocabulary.
corpus = [
    "Authman ran faster than Harry because he is an athlete.",
    "Authman and Harry ran faster and faster.",
]

In [19]:
# Bag-of-words vectorizer created with no arguments, i.e. scikit-learn's default settings.
bag_of_words = CountVectorizer()

In [20]:
# Learn the vocabulary from the corpus and transform it in one step;
# X holds one row of term counts per document.
X = bag_of_words.fit_transform(corpus)

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display of the learned vocabulary.
bag_of_words.get_feature_names_out().tolist()

['an',
 'and',
 'athlete',
 'authman',
 'because',
 'faster',
 'harry',
 'he',
 'is',
 'ran',
 'than']

In [23]:
X.toarray() # returns the term counts as a dense NumPy array (not a Python list)

array([[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0]], dtype=int64)

### Graphical Features

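**Resize the picture to a fixed size, convert it to grayscale, then encode every pixel as an element in a unidimensional feature array.**

Note: the cells below only approximate this recipe. They halve the resolution by keeping every second row and column, skip the grayscale conversion, and reshape the image to one row per pixel with 4 channel values rather than a single 1-D vector.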

In [24]:
from scipy import misc

In [25]:
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2, so this call fails on current
# SciPy; a replacement image reader (e.g. imageio) is needed. The absolute Windows path
# also resolves only on the original author's machine.
img = misc.imread(r"C:\Users\aaaaaaaa\Desktop\edX ML\Module 2\Module2\ml_map.png")

In [26]:
# Keep every second row and every second column, halving the image along its first two axes.
img = img[::2,::2]

In [28]:
# Divide every value by 255.0, giving a float array (0 -> 0.0, 255 -> 1.0).
# This rebinds X, which previously held the bag-of-words count matrix.
X = img/255.0

In [30]:
X.shape

(662, 1061, 4)

In [32]:
# Collapse the first two axes so each row is one pixel with its 4 channel values.
# The reshaped array is only displayed here; it is not assigned to a variable.
X.reshape(-1,4)

array([[ 1.        ,  1.        ,  1.        ,  0.96862745],
       [ 1.        ,  1.        ,  1.        ,  0.96862745],
       [ 1.        ,  1.        ,  1.        ,  0.96862745],
       ..., 
       [ 1.        ,  1.        ,  1.        ,  0.96862745],
       [ 1.        ,  1.        ,  1.        ,  0.96862745],
       [ 1.        ,  1.        ,  1.        ,  0.96862745]])