**Basic Text Features**

In [1]:
#sample text
text = "Dark matter is one of the greatest enigmas of astrophysics and cosmology"

Tokenisation

In [2]:
text.split(' ')

['Dark',
 'matter',
 'is',
 'one',
 'of',
 'the',
 'greatest',
 'enigmas',
 'of',
 'astrophysics',
 'and',
 'cosmology']

In [3]:
#seperating individual words 
words = text.split(' ')

In [4]:
#number of words
len(words)

12

In [5]:
#number of spaces
text.count(' ')

11

In [6]:
#character count
len(text) - text.count(' ')

61

Average word length

In [7]:
words_lengths = []
for i in text.split(' '):
  words_lengths.append(len(i))

words_lengths

[4, 6, 2, 3, 2, 3, 8, 7, 2, 12, 3, 9]

In [8]:
#average words lengths
sum(words_lengths)/len(words_lengths)

5.083333333333333

**Creating features for Dataset**

In [9]:
import pandas as pd


In [10]:
tweets = pd.read_csv('tweets.csv')
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


**1. Word Count Feature**

In [11]:
tweets['word_count'] = [len(i.split(' ')) for i in tweets['tweet']]
tweets.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17
2,3,0,We love this! Would you go? #talk #makememorie...,15
3,4,0,I'm wired I know I'm George I was made that wa...,17
4,5,1,What amazing service! Apple won't even talk to...,23


**2. Space Count Feature**

In [12]:
tweets['space_count'] = [ i.count(' ') for i in tweets['tweet']]
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16
2,3,0,We love this! Would you go? #talk #makememorie...,15,14
3,4,0,I'm wired I know I'm George I was made that wa...,17,16
4,5,1,What amazing service! Apple won't even talk to...,23,22


**3. Character Count Feature**

In [13]:
tweets['character_count'] = [ (len(i) - i.count(' ')) for i in tweets['tweet']]
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count,character_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96
4,5,1,What amazing service! Apple won't even talk to...,23,22,102


**4. Average length of words**

In [14]:
avg_word_length = []

for i in tweets['tweet']:
  words_length = []
  for j in i.split(' '):
    words_length.append(len(j))
  
  l = sum(words_length)/len(words_length)
  avg_word_length.append(l)

In [15]:
tweets['avg_word_lengths'] = avg_word_length
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count,character_count,avg_word_lengths
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116,8.923077
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115,6.764706
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109,7.266667
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96,5.647059
4,5,1,What amazing service! Apple won't even talk to...,23,22,102,4.434783


**Building model using Newly created features**

In [16]:
x = tweets[['word_count', 'space_count', 'character_count', 'avg_word_lengths']]
y = tweets['label']

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [18]:
##splitting the data
sc = StandardScaler()
sc.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 42)

In [19]:
x_train.shape, x_test.shape

((5306, 4), (2614, 4))

In [20]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [21]:
y_pred = lr.predict_proba(x_test)
y_pred

array([[0.92336015, 0.07663985],
       [0.60005301, 0.39994699],
       [0.95178064, 0.04821936],
       ...,
       [0.22660127, 0.77339873],
       [0.57452014, 0.42547986],
       [0.85191451, 0.14808549]])

In [22]:
roc_auc_score(y_test, y_pred[:, 1])

0.8635264196916073