# Handling Text Data

## Cleaning Text

In [1]:
data = ["I am. a good boy        ","      Moon is Earth's natural. satelite   ","  The jar . is empty"]

In [2]:
data

['I am. a good boy        ',
 "      Moon is Earth's natural. satelite   ",
 '  The jar . is empty']

In [4]:
#strip the whitespace
stripped_data = [string.strip() for string in data]

In [5]:
stripped_data

['I am. a good boy', "Moon is Earth's natural. satelite", 'The jar . is empty']

In [6]:
# remove unnecessary','
fixed_data = [string.replace(".","") for string in stripped_data]

In [7]:
fixed_data

['I am a good boy', "Moon is Earth's natural satelite", 'The jar  is empty']

In [8]:
# custom function
def capitalize(string):
    return string.upper()

#capitalize string
capitalize_data = [capitalize(string) for string in fixed_data]

In [9]:
capitalize_data 

['I AM A GOOD BOY', "MOON IS EARTH'S NATURAL SATELITE", 'THE JAR  IS EMPTY']

## Removing Punctuations

In [10]:
import unicodedata
import sys

In [11]:
text_data = ['Hi!!!! I. Love. This. Song....','10000% Agree!!!! #LoveIT','Right?!?!']

In [12]:
text_data

['Hi!!!! I. Love. This. Song....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

In [13]:
# Create a dictionary of punctuation characters
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('p'))

In [14]:
[string.translate(punctuation) for string in text_data]

['Hi!!!! I. Love. This. Song....', '10000% Agree!!!! #LoveIT', 'Right?!?!']

## Creating feature matrix from text data

In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
text_data = np.array(['I love India. India!','USA is a developed country','England has better environment'])

In [20]:
text_data

array(['I love India. India!', 'USA is a developed country',
       'England has better environment'], dtype='<U30')

In [21]:
count = CountVectorizer()

In [22]:
feature_words = count.fit_transform(text_data)

In [23]:
print(feature_words)

  (0, 8)	1
  (0, 6)	2
  (1, 9)	1
  (1, 7)	1
  (1, 2)	1
  (1, 1)	1
  (2, 3)	1
  (2, 5)	1
  (2, 0)	1
  (2, 4)	1


In [25]:
feature_text = feature_words.toarray()

In [26]:
feature_text

array([[0, 0, 0, 0, 0, 0, 2, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 0, 0, 0]], dtype=int64)

In [27]:
feature_name = count.get_feature_names()

In [28]:
feature_name

['better',
 'country',
 'developed',
 'england',
 'environment',
 'has',
 'india',
 'is',
 'love',
 'usa']

In [29]:
dataframe = pd.DataFrame(data = feature_text,columns = feature_name)

In [30]:
dataframe

Unnamed: 0,better,country,developed,england,environment,has,india,is,love,usa
0,0,0,0,0,0,0,2,0,1,0
1,0,1,1,0,0,0,0,1,0,1
2,1,0,0,1,1,1,0,0,0,0


## Creating feature by assigning weight

In [33]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
text_data = np.array(['I love India. India!','USA is a developed country','England has better environment'])

In [37]:
tfidf = TfidfVectorizer()

In [38]:
feature_matrix = tfidf.fit_transform(text_data)

In [39]:
print(feature_matrix)

  (0, 6)	0.8944271909999159
  (0, 8)	0.4472135954999579
  (1, 1)	0.5
  (1, 2)	0.5
  (1, 7)	0.5
  (1, 9)	0.5
  (2, 4)	0.5
  (2, 0)	0.5
  (2, 5)	0.5
  (2, 3)	0.5


In [40]:
feature_array = feature_matrix.toarray()