# Basic Text Features

In [1]:
# sample text string
text = "Dark matter is one of the greatest enigmas of astrophysics and cosmology"


We will split the string into individual words or tokens. This is also known as __tokenization__.

In [3]:
# split words of the text
text.split()

['Dark',
 'matter',
 'is',
 'one',
 'of',
 'the',
 'greatest',
 'enigmas',
 'of',
 'astrophysics',
 'and',
 'cosmology']

In [4]:
# store the individual words in a variable
words = text.split()

### 1. Number of Words

In [5]:
# word count
len(words)

12

### 2. Number of Spaces

In [6]:
# spaces count
text.count(' ')

11

### 3. Number of Characters

In [7]:
# character count
len(text)

72

Even the spaces have been included.

In [8]:
# character count (excluding spaces)
len(text)-text.count(' ')

61

So, the text string has 61 characters excluding spaces.

### 4. Average Word Length

In [9]:
# empty list for
word_lengths = []

for i in text.split():
    word_lengths.append(len(i))
    
print(word_lengths)

[4, 6, 2, 3, 2, 3, 8, 7, 2, 12, 3, 9]


In [9]:
# average word length
sum(word_lengths)/len(word_lengths)

5.083333333333333

---

# Create Features for Twitter Dataset

Let's create the above mentioned features for a real-life dataset. 

In [10]:
import pandas as pd

In [12]:
tweets = pd.read_csv("datasets\\tweets.csv")

Have a glimpse at the data.

In [13]:
tweets.head()


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [14]:
tweets.iloc[0]

id                                                       1
label                                                    0
tweet    #fingerprint #Pregnancy Test https://goo.gl/h1...
Name: 0, dtype: object

This dataset has 3 features right now. 

1. __id:__ tweet id number, unique for every tweet
2. __label:__ 1 for negative tweet and 0 for positive or neutral tweet
3. __tweet:__ text data

We will create new features from the feature "tweet".


### 1. Word Count Feature

In [6]:
# number of words/terms in the tweets
tweets['word_count'] = [len(i.split()) for i in tweets['tweet']]

In [18]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17
2,3,0,We love this! Would you go? #talk #makememorie...,15
3,4,0,I'm wired I know I'm George I was made that wa...,17
4,5,1,What amazing service! Apple won't even talk to...,23


As you can see, we have a new feature __word_count__. Now let's create a feature of number of spaces in the tweets.

### 2. Space Count Feature

In [19]:
tweets['space_count'] = [i.count(' ') for i in tweets['tweet']]

In [20]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16
2,3,0,We love this! Would you go? #talk #makememorie...,15,14
3,4,0,I'm wired I know I'm George I was made that wa...,17,16
4,5,1,What amazing service! Apple won't even talk to...,23,22


### 3. Character Count Feature

In [23]:
tweets['character_count'] = [len(i) - i.count(' ') for i in tweets['tweet']]

In [26]:
tweets['character_count1']=[len(i)- i.count(' ') for i in tweets['tweet']]

In [27]:
tweets.head(4)

Unnamed: 0,id,label,tweet,word_count,space_count,character_count1,character_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116,116
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115,115
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109,109
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96,96


### 4. Average Word Length Feature

In [33]:
avg_word_length = []

# nested for loop
for i in tweets['tweet']:
    word_lengths = []
    for j in i.split():
        # length of terms in a tweet
        word_lengths.append(len(j))
    
    # average word length of a tweet
    l = sum(word_lengths)/len(word_lengths)
    
    avg_word_length.append(l)

In [34]:
avg_word_length

[8.923076923076923,
 6.764705882352941,
 7.266666666666667,
 5.647058823529412,
 4.434782608695652,
 5.0,
 8.181818181818182,
 7.666666666666667,
 8.357142857142858,
 8.214285714285714,
 4.0,
 4.857142857142857,
 6.3125,
 6.052631578947368,
 6.111111111111111,
 9.333333333333334,
 15.176470588235293,
 10.090909090909092,
 8.0,
 4.2727272727272725,
 5.3428571428571425,
 5.473684210526316,
 4.0,
 5.0,
 8.071428571428571,
 4.333333333333333,
 3.888888888888889,
 3.9642857142857144,
 6.875,
 7.176470588235294,
 9.333333333333334,
 8.461538461538462,
 7.866666666666666,
 4.176470588235294,
 7.083333333333333,
 4.25,
 5.0,
 5.25,
 4.36,
 10.636363636363637,
 8.0,
 9.137931034482758,
 8.818181818181818,
 9.5,
 9.272727272727273,
 10.0,
 4.0,
 6.214285714285714,
 8.5,
 6.444444444444445,
 6.8,
 7.6,
 7.857142857142857,
 7.6,
 4.857142857142857,
 3.607142857142857,
 10.333333333333334,
 4.488888888888889,
 4.0,
 6.0,
 6.9411764705882355,
 6.8,
 4.7727272727272725,
 4.842105263157895,
 8.1333333

In [35]:
# create new feature 
tweets['average_word_length'] = avg_word_length

# Build Model

In [36]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count,character_count1,character_count,average_word_length
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116,116,8.923077
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115,115,6.764706
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109,109,7.266667
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96,96,5.647059
4,5,1,What amazing service! Apple won't even talk to...,23,22,102,102,4.434783


In [39]:
X = tweets[['word_count', 'space_count', 'character_count', 'average_word_length']]
y = tweets['label']

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler # for standardization

In [40]:
# split dataset into train and test set
xtrain, xtest, ytrain, ytest = train_test_split(StandardScaler().fit_transform(X), y, 
                                                test_size=0.33, random_state=42)

In [41]:
xtrain.shape, xtest.shape

((5306, 4), (2614, 4))

In [42]:
# fit model
lr = LogisticRegression()
lr.fit(xtrain, ytrain)

LogisticRegression()

In [43]:
# predict on test set
preds = lr.predict_proba(xtest)

In [44]:
preds

array([[0.92294669, 0.07705331],
       [0.59967747, 0.40032253],
       [0.9516382 , 0.0483618 ],
       ...,
       [0.22800467, 0.77199533],
       [0.57410116, 0.42589884],
       [0.85136928, 0.14863072]])

In [45]:
roc_auc_score(ytest, preds[:,1])

0.8634997421167766