-----------------------

# <center><font color='#800000'>**Word2Vec and Average Word2Vec** </font></center>

-----------------------

In [12]:
# Imports
# =======

import pandas as pd, numpy as np

#Data cleaning and preprocessing
import re

# NLTK Imports
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()                # Creating ojects
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()      # Creating ojects

# Sklearn Imports
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Gensim Imports
import gensim
import gensim.downloader as api
from gensim.utils import simple_preprocess

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
wv = api.load('word2vec-google-news-300')

--------------------------
## **Methodology**
--------------------------

### `Step 1 - Reading the Data & Exploring it a bit`

### `Step 2 - Text Preprocessing 1`
- Tokenization
- Stopwords handling (Removal)
- Stemming
- Lemmatization

### `Step 3 - Text Preprocessing 2`
- Bag of Words
- TF-IFD
- Word2Vec
- AvgWord2Vec

### `Step 4 - Machine Learning Modelling`

--------------------------


In [32]:
# Importing the Data
# ==================
messages = pd.read_csv('../Data/smsspamcollection/SMSSpamCollection', 
                       sep='\t',
                       names=["label", "message"])
display(messages.shape)
display(messages.head())

(5572, 2)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
# Checking the "label" counts
messages['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [34]:
# Checking for a random word(s)
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [35]:
# Converting labels to 0s and 1s
# messages['label'] = messages['label'].apply(lambda x: 1 if x=='ham' else 0)

In [37]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [38]:
print(corpus[0])

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


### **Training with Naive Bayes (Multinomial) Model**

In [57]:
# Creating the Bag of Words model
# ===============================
# Using the top 2500 features
cv = sklearn.feature_extraction.text.CountVectorizer(max_features=2500, binary=True, ngram_range=(1,2))  

X = cv.fit_transform(corpus).toarray()
print("Shape of X is : ", X.shape)
print("X")
print(X)

# Dummy Encoding values

y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values
print("\ny\n: ",y[:20])

Shape of X is :  (5572, 2500)
X
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

y
:  [False False  True False False  True False False  True  True False  True
  True False False  True False False False  True]


In [58]:
# Train Test Split
# ----------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0, stratify=y)

# Training using Naive Bayes Model
# --------------------------------
spam_detect_model = sklearn.naive_bayes.MultinomialNB().fit(X_train, y_train)

# Prediction on Test Set
y_pred=spam_detect_model.predict(X_test)
score=accuracy_score(y_test,y_pred).round(4)
print("Accuracy of the Model: ",score)
print()
# Checking the Classification Report
print(classification_report(y_pred,y_test))

Accuracy of the Model:  0.9821

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       972
        True       0.91      0.95      0.93       143

    accuracy                           0.98      1115
   macro avg       0.95      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### **Training with TFIDF-Vectorizer**

In [56]:
# Defining the TFIDF model object
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(max_features=2500)
# Fitting the TFIDF
X = tfidf.fit_transform(corpus).toarray()

# Defining the Naive Bayes Object
spam_detect_model = sklearn.naive_bayes.MultinomialNB().fit(X_train, y_train)

# Prediction on the Test set
y_pred=spam_detect_model.predict(X_test)

# Printing the Metrics
score=accuracy_score(y_test,y_pred).round(4)
print("Accuracy of the Model: ",score)
# Checking the Classification Report
print(classification_report(y_pred,y_test))

Accuracy of the Model:  0.9785
              precision    recall  f1-score   support

       False       0.99      0.98      0.99       972
        True       0.90      0.94      0.92       143

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115



### **Training with Random Forest**

In [62]:
# Defining the TFIDF model object
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(max_features=2500, ngram_range=(1,2))
# Fitting the TFIDF
X = tfidf.fit_transform(corpus).toarray()

# Defining the Random Forest
from sklearn.ensemble import RandomForestClassifier
spam_detect_model = RandomForestClassifier().fit(X_train, y_train)

# Prediction on the Test set
y_pred=spam_detect_model.predict(X_test)

# Printing the Metrics
score=accuracy_score(y_test,y_pred).round(4)
print("Accuracy of the Model: ",score)
# Checking the Classification Report
print(classification_report(y_pred,y_test))

Accuracy of the Model:  0.9767
              precision    recall  f1-score   support

       False       1.00      0.98      0.99       988
        True       0.84      0.98      0.91       127

    accuracy                           0.98      1115
   macro avg       0.92      0.98      0.95      1115
weighted avg       0.98      0.98      0.98      1115



------------------------------

## Word2vec Implementation

In [348]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
print(corpus[0])    

In [351]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))
display(words)        

#### **Training word2vec from Scratch**


In [464]:
### Lets train Word2vec from scratch
# ----------------------------------
model=gensim.models.Word2Vec(words,window=5,min_count=2)

In [465]:
print(model.wv.index_to_key)

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [466]:
print(model.corpus_count)

5564

In [467]:
print(model.epochs)

5

In [468]:
model.wv.similar_by_word('kid')

[('work', 0.9972211122512817),
 ('much', 0.997178316116333),
 ('money', 0.9970908164978027),
 ('like', 0.9970575571060181),
 ('went', 0.997051477432251),
 ('going', 0.9970475435256958),
 ('sent', 0.9970418810844421),
 ('hi', 0.9970265030860901),
 ('oh', 0.9970120191574097),
 ('tomorrow', 0.9970046281814575)]

In [469]:
model.wv['kid'].shape

(100,)

In [452]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)
        
    
    

In [55]:
!pip install tqdm



In [364]:
from tqdm import tqdm

In [367]:
words[73]

['performed']

In [379]:
type(model.wv.index_to_key)

list

In [453]:
#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    print("Hello",i)
    X.append(avg_word2vec(words[i]))

    

  6%|████▉                                                                        | 360/5564 [00:00<00:01, 3599.19it/s]

Hello 0
Hello 1
Hello 2
Hello 3
Hello 4
Hello 5
Hello 6
Hello 7
Hello 8
Hello 9
Hello 10
Hello 11
Hello 12
Hello 13
Hello 14
Hello 15
Hello 16
Hello 17
Hello 18
Hello 19
Hello 20
Hello 21
Hello 22
Hello 23
Hello 24
Hello 25
Hello 26
Hello 27
Hello 28
Hello 29
Hello 30
Hello 31
Hello 32
Hello 33
Hello 34
Hello 35
Hello 36
Hello 37
Hello 38
Hello 39
Hello 40
Hello 41
Hello 42
Hello 43
Hello 44
Hello 45
Hello 46
Hello 47
Hello 48
Hello 49
Hello 50
Hello 51
Hello 52
Hello 53
Hello 54
Hello 55
Hello 56
Hello 57
Hello 58
Hello 59
Hello 60
Hello 61
Hello 62
Hello 63
Hello 64
Hello 65
Hello 66
Hello 67
Hello 68
Hello 69
Hello 70
Hello 71
Hello 72
Hello 73
Hello 74
Hello 75
Hello 76
Hello 77
Hello 78
Hello 79
Hello 80
Hello 81
Hello 82
Hello 83
Hello 84
Hello 85
Hello 86
Hello 87
Hello 88
Hello 89
Hello 90
Hello 91
Hello 92
Hello 93
Hello 94
Hello 95
Hello 96
Hello 97
Hello 98
Hello 99
Hello 100
Hello 101
Hello 102
Hello 103
Hello 104
Hello 105
Hello 106
Hello 107
Hello 108
Hello 109
Hello 110


 21%|████████████████▎                                                           | 1192/5564 [00:00<00:01, 3694.12it/s]

 712
Hello 713
Hello 714
Hello 715
Hello 716
Hello 717
Hello 718
Hello 719
Hello 720
Hello 721
Hello 722
Hello 723
Hello 724
Hello 725
Hello 726
Hello 727
Hello 728
Hello 729
Hello 730
Hello 731
Hello 732
Hello 733
Hello 734
Hello 735
Hello 736
Hello 737
Hello 738
Hello 739
Hello 740
Hello 741
Hello 742
Hello 743
Hello 744
Hello 745
Hello 746
Hello 747
Hello 748
Hello 749
Hello 750
Hello 751
Hello 752
Hello 753
Hello 754
Hello 755
Hello 756
Hello 757
Hello 758
Hello 759
Hello 760
Hello 761
Hello 762
Hello 763
Hello 764
Hello 765
Hello 766
Hello 767
Hello 768
Hello 769
Hello 770
Hello 771
Hello 772
Hello 773
Hello 774
Hello 775
Hello 776
Hello 777
Hello 778
Hello 779
Hello 780
Hello 781
Hello 782
Hello 783
Hello 784
Hello 785
Hello 786
Hello 787
Hello 788
Hello 789
Hello 790
Hello 791
Hello 792
Hello 793
Hello 794
Hello 795
Hello 796
Hello 797
Hello 798
Hello 799
Hello 800
Hello 801
Hello 802
Hello 803
Hello 804
Hello 805
Hello 806
Hello 807
Hello 808
Hello 809
Hello 810
Hello 811
Hello

 35%|██████████████████████████▊                                                 | 1961/5564 [00:00<00:00, 3785.71it/s]

Hello 1399
Hello 1400
Hello 1401
Hello 1402
Hello 1403
Hello 1404
Hello 1405
Hello 1406
Hello 1407
Hello 1408
Hello 1409
Hello 1410
Hello 1411
Hello 1412
Hello 1413
Hello 1414
Hello 1415
Hello 1416
Hello 1417
Hello 1418
Hello 1419
Hello 1420
Hello 1421
Hello 1422
Hello 1423
Hello 1424
Hello 1425
Hello 1426
Hello 1427
Hello 1428
Hello 1429
Hello 1430
Hello 1431
Hello 1432
Hello 1433
Hello 1434
Hello 1435
Hello 1436
Hello 1437
Hello 1438
Hello 1439
Hello 1440
Hello 1441
Hello 1442
Hello 1443
Hello 1444
Hello 1445
Hello 1446
Hello 1447
Hello 1448
Hello 1449
Hello 1450
Hello 1451
Hello 1452
Hello 1453
Hello 1454
Hello 1455
Hello 1456
Hello 1457
Hello 1458
Hello 1459
Hello 1460
Hello 1461
Hello 1462
Hello 1463
Hello 1464
Hello 1465
Hello 1466
Hello 1467
Hello 1468
Hello 1469
Hello 1470
Hello 1471
Hello 1472
Hello 1473
Hello 1474
Hello 1475
Hello 1476
Hello 1477
Hello 1478
Hello 1479
Hello 1480
Hello 1481
Hello 1482
Hello 1483
Hello 1484
Hello 1485
Hello 1486
Hello 1487
Hello 1488
Hello 1489

 49%|█████████████████████████████████████▏                                      | 2718/5564 [00:00<00:00, 3742.37it/s]

Hello 2110
Hello 2111
Hello 2112
Hello 2113
Hello 2114
Hello 2115
Hello 2116
Hello 2117
Hello 2118
Hello 2119
Hello 2120
Hello 2121
Hello 2122
Hello 2123
Hello 2124
Hello 2125
Hello 2126
Hello 2127
Hello 2128
Hello 2129
Hello 2130
Hello 2131
Hello 2132
Hello 2133
Hello 2134
Hello 2135
Hello 2136
Hello 2137
Hello 2138
Hello 2139
Hello 2140
Hello 2141
Hello 2142
Hello 2143
Hello 2144
Hello 2145
Hello 2146
Hello 2147
Hello 2148
Hello 2149
Hello 2150
Hello 2151
Hello 2152
Hello 2153
Hello 2154
Hello 2155
Hello 2156
Hello 2157
Hello 2158
Hello 2159
Hello 2160
Hello 2161
Hello 2162
Hello 2163
Hello 2164
Hello 2165
Hello 2166
Hello 2167
Hello 2168
Hello 2169
Hello 2170
Hello 2171
Hello 2172
Hello 2173
Hello 2174
Hello 2175
Hello 2176
Hello 2177
Hello 2178
Hello 2179
Hello 2180
Hello 2181
Hello 2182
Hello 2183
Hello 2184
Hello 2185
Hello 2186
Hello 2187
Hello 2188
Hello 2189
Hello 2190
Hello 2191
Hello 2192
Hello 2193
Hello 2194
Hello 2195
Hello 2196
Hello 2197
Hello 2198
Hello 2199
Hello 2200

 63%|███████████████████████████████████████████████▉                            | 3505/5564 [00:00<00:00, 3823.43it/s]

 2806
Hello 2807
Hello 2808
Hello 2809
Hello 2810
Hello 2811
Hello 2812
Hello 2813
Hello 2814
Hello 2815
Hello 2816
Hello 2817
Hello 2818
Hello 2819
Hello 2820
Hello 2821
Hello 2822
Hello 2823
Hello 2824
Hello 2825
Hello 2826
Hello 2827
Hello 2828
Hello 2829
Hello 2830
Hello 2831
Hello 2832
Hello 2833
Hello 2834
Hello 2835
Hello 2836
Hello 2837
Hello 2838
Hello 2839
Hello 2840
Hello 2841
Hello 2842
Hello 2843
Hello 2844
Hello 2845
Hello 2846
Hello 2847
Hello 2848
Hello 2849
Hello 2850
Hello 2851
Hello 2852
Hello 2853
Hello 2854
Hello 2855
Hello 2856
Hello 2857
Hello 2858
Hello 2859
Hello 2860
Hello 2861
Hello 2862
Hello 2863
Hello 2864
Hello 2865
Hello 2866
Hello 2867
Hello 2868
Hello 2869
Hello 2870
Hello 2871
Hello 2872
Hello 2873
Hello 2874
Hello 2875
Hello 2876
Hello 2877
Hello 2878
Hello 2879
Hello 2880
Hello 2881
Hello 2882
Hello 2883
Hello 2884
Hello 2885
Hello 2886
Hello 2887
Hello 2888
Hello 2889
Hello 2890
Hello 2891
Hello 2892
Hello 2893
Hello 2894
Hello 2895
Hello 2896
Hell


 70%|█████████████████████████████████████████████████████▎                      | 3900/5564 [00:01<00:00, 3861.80it/s]

Hello 3529
Hello 3530
Hello 3531
Hello 3532
Hello 3533
Hello 3534
Hello 3535
Hello 3536
Hello 3537
Hello 3538
Hello 3539
Hello 3540
Hello 3541
Hello 3542
Hello 3543
Hello 3544
Hello 3545
Hello 3546
Hello 3547
Hello 3548
Hello 3549
Hello 3550
Hello 3551
Hello 3552
Hello 3553
Hello 3554
Hello 3555
Hello 3556
Hello 3557
Hello 3558
Hello 3559
Hello 3560
Hello 3561
Hello 3562
Hello 3563
Hello 3564
Hello 3565
Hello 3566
Hello 3567
Hello 3568
Hello 3569
Hello 3570
Hello 3571
Hello 3572
Hello 3573
Hello 3574
Hello 3575
Hello 3576
Hello 3577
Hello 3578
Hello 3579
Hello 3580
Hello 3581
Hello 3582
Hello 3583
Hello 3584
Hello 3585
Hello 3586
Hello 3587
Hello 3588
Hello 3589
Hello 3590
Hello 3591
Hello 3592
Hello 3593
Hello 3594
Hello 3595
Hello 3596
Hello 3597
Hello 3598
Hello 3599
Hello 3600
Hello 3601
Hello 3602
Hello 3603
Hello 3604
Hello 3605
Hello 3606
Hello 3607
Hello 3608
Hello 3609
Hello 3610
Hello 3611
Hello 3612
Hello 3613
Hello 3614
Hello 3615
Hello 3616
Hello 3617
Hello 3618
Hello 3619

 84%|███████████████████████████████████████████████████████████████▉            | 4679/5564 [00:01<00:00, 3850.50it/s]

Hello 4266
Hello 4267
Hello 4268
Hello 4269
Hello 4270
Hello 4271
Hello 4272
Hello 4273
Hello 4274
Hello 4275
Hello 4276
Hello 4277
Hello 4278
Hello 4279
Hello 4280
Hello 4281
Hello 4282
Hello 4283
Hello 4284
Hello 4285
Hello 4286
Hello 4287
Hello 4288
Hello 4289
Hello 4290
Hello 4291
Hello 4292
Hello 4293
Hello 4294
Hello 4295
Hello 4296
Hello 4297
Hello 4298
Hello 4299
Hello 4300
Hello 4301
Hello 4302
Hello 4303
Hello 4304
Hello 4305
Hello 4306
Hello 4307
Hello 4308
Hello 4309
Hello 4310
Hello 4311
Hello 4312
Hello 4313
Hello 4314
Hello 4315
Hello 4316
Hello 4317
Hello 4318
Hello 4319
Hello 4320
Hello 4321
Hello 4322
Hello 4323
Hello 4324
Hello 4325
Hello 4326
Hello 4327
Hello 4328
Hello 4329
Hello 4330
Hello 4331
Hello 4332
Hello 4333
Hello 4334
Hello 4335
Hello 4336
Hello 4337
Hello 4338
Hello 4339
Hello 4340
Hello 4341
Hello 4342
Hello 4343
Hello 4344
Hello 4345
Hello 4346
Hello 4347
Hello 4348
Hello 4349
Hello 4350
Hello 4351
Hello 4352
Hello 4353
Hello 4354
Hello 4355
Hello 4356

100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:01<00:00, 3833.38it/s]

 4978
Hello 4979
Hello 4980
Hello 4981
Hello 4982
Hello 4983
Hello 4984
Hello 4985
Hello 4986
Hello 4987
Hello 4988
Hello 4989
Hello 4990
Hello 4991
Hello 4992
Hello 4993
Hello 4994
Hello 4995
Hello 4996
Hello 4997
Hello 4998
Hello 4999
Hello 5000
Hello 5001
Hello 5002
Hello 5003
Hello 5004
Hello 5005
Hello 5006
Hello 5007
Hello 5008
Hello 5009
Hello 5010
Hello 5011
Hello 5012
Hello 5013
Hello 5014
Hello 5015
Hello 5016
Hello 5017
Hello 5018
Hello 5019
Hello 5020
Hello 5021
Hello 5022
Hello 5023
Hello 5024
Hello 5025
Hello 5026
Hello 5027
Hello 5028
Hello 5029
Hello 5030
Hello 5031
Hello 5032
Hello 5033
Hello 5034
Hello 5035
Hello 5036
Hello 5037
Hello 5038
Hello 5039
Hello 5040
Hello 5041
Hello 5042
Hello 5043
Hello 5044
Hello 5045
Hello 5046
Hello 5047
Hello 5048
Hello 5049
Hello 5050
Hello 5051
Hello 5052
Hello 5053
Hello 5054
Hello 5055
Hello 5056
Hello 5057
Hello 5058
Hello 5059
Hello 5060
Hello 5061
Hello 5062
Hello 5063
Hello 5064
Hello 5065
Hello 5066
Hello 5067
Hello 5068
Hell




In [454]:
type(X)

list

In [455]:
X_new=np.array(X)

  X_new=np.array(X)


In [456]:
X_new[3]

array([-0.13515173,  0.37618315,  0.19070846,  0.02991627,  0.04826126,
       -0.41647243,  0.11560273,  0.66422325, -0.2143672 , -0.19525246,
       -0.1978392 , -0.43449304, -0.00917046,  0.12552214,  0.09977287,
       -0.35573253,  0.0294413 , -0.4246745 , -0.01977133, -0.56335396,
        0.08949187,  0.19041474,  0.12510803, -0.16838486, -0.14186232,
        0.04411262, -0.2714059 , -0.1799921 , -0.27904728,  0.07080126,
        0.38256872,  0.07004831,  0.1977586 , -0.29876754, -0.15707694,
        0.3415544 ,  0.0428506 , -0.24809606, -0.22363473, -0.48549023,
        0.076442  , -0.2930331 , -0.11361477,  0.05708167,  0.32048273,
       -0.16589147, -0.2346162 ,  0.03779726,  0.18577482,  0.30356586,
        0.21503724, -0.3583407 , -0.07625987, -0.02272328, -0.16456175,
        0.2691315 ,  0.21715061, -0.05291209, -0.34251794,  0.079744  ,
        0.13134973,  0.15683769, -0.1648535 , -0.0069424 , -0.34232315,
        0.20215316,  0.11598274,  0.24336684, -0.36031055,  0.42

In [459]:
X_new.shape

(5564,)