In [None]:
import pandas as pd
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)
import plotly.express as px
import numpy as np
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import plotly
import plotly.graph_objects as go


# Shared label encoder; it is fitted further down on the `category` column
# and its `classes_` are reused to map integer codes back to category names.
le= LabelEncoder()


In [None]:
# Load the cleaned corpus and keep only the columns used in this notebook:
# the source file name, the category label and the pre-processed text.
columns_to_keep = ['file_name', 'category', 'content3']
data = pd.read_csv('data/cleaned_news_dataset.csv').loc[:, columns_to_keep].copy()
data

In [None]:
# Re-display the loaded frame for a visual sanity check.
data

In [None]:
# Labels: everything except the text and the file name. At this point that
# leaves a one-column DataFrame holding `category`.
Y = data.drop(columns=['content3', 'file_name'])


In [None]:
# Features: the pre-processed article text, kept as a pandas Series.
X = data.loc[:, 'content3']

In [None]:
# Hold out 15% of the documents for validation; the fixed seed keeps the
# split reproducible across runs.
train_x, val_x, train_y, val_y = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=8,
)

## TF-IDF vectorizer parameters
We have to define the different parameters:

    ngram_range: We want to consider both unigrams and bigrams.
    max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
    min_df: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    max_features: If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus.


In [None]:
# Persist the raw-text splits so later notebooks can reuse the exact same
# train/validation partition.
text_split_paths = {
    'data/train_x_text.pickle': train_x,
    'data/val_x_text.pickle': val_x,
    'data/train_y_text.pickle': train_y,
    'data/val_y_text.pickle': val_y,
}
for pickle_path, split in text_split_paths.items():
    split.to_pickle(pickle_path)

In [None]:
# Number of training rows (printed) and validation rows (cell output).
print(train_y.shape[0])
val_y.shape[0]

## Finding the most frequent words per category

In [None]:
# Count how often every word occurs inside every category.
#
# `dic` maps (word, category) -> total number of occurrences of `word`
# summed over all documents labelled `category`.
#
# Fix: the previous version used two independent `if` statements, so a key
# seen for the first time was stored and then immediately incremented again,
# double-counting the first document of every (word, category) pair.
news = data['content3'].values
target = data['category'].values
dic = {}
for text, label in zip(news, target):
    # Per-document word frequencies (tokens are split on single spaces).
    val_count = pd.Series(text.split(' ')).value_counts()
    for word, occurrences in val_count.items():
        key = (word, label)
        dic[key] = dic.get(key, 0) + occurrences

In [None]:
# All ((word, category), count) pairs, most frequent first.
sorted(
    dic.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

In [None]:
# Peek at a single ((word, category), count) entry to confirm the layout of `dic`.
list(dic.items())[0]

In [None]:
# Flatten the (word, category) -> count mapping into three parallel lists
# and wrap them in a tidy DataFrame with one row per (word, category) pair.
words = [key[0] for key in dic]
category = [key[1] for key in dic]
counts = list(dic.values())

df = pd.DataFrame({'Words': words, 'category': category, 'counts': counts})
df

In [None]:
# One word-count frame per news category.
df_g = df.groupby('category')
business, tech, entertainment, sport, politics = (
    df_g.get_group(label)
    for label in ('business', 'tech', 'entertainment', 'sport', 'politics')
)

In [None]:
# Business words that occur more than 100 times, most frequent first.
frequent_mask = business['counts'] > 100
best_words = business.loc[frequent_mask]
best_words.sort_values(by='counts', ascending=False)

In [None]:
# Scatter plot of the high-frequency words (count > 100) of every category.
# x is the row index of the word-count frame, y is the word's total count.
fig = go.Figure()
cats = ['business', 'tech', 'entertainment', 'sport', 'politics']

for cat_name, cat_words in zip(cats, [business, tech, entertainment, sport, politics]):
    best_words = cat_words[cat_words['counts'].values > 100].sort_values(by='counts', ascending=False)
    fig.add_trace(
        go.Scatter(
            x=best_words.index,
            y=best_words['counts'].values,
            mode='markers',
            name=cat_name,
            # Fix: hover labels must come from the same filtered and sorted
            # rows as x and y; the unfiltered frame attached the wrong word
            # to each marker.
            hovertext=best_words['Words'],
        )
    )

fig.update_layout(
    xaxis={
        'tickmode': 'array',
        # NOTE(review): hard-coded row-index positions; presumably they mark
        # where each category's rows sit in the word-count frame. Confirm
        # whenever the dataset changes.
        'tickvals': [10, 9000, 17000, 25000, 33000],
        'ticktext': ['business', 'entertainment', 'politics', 'sport', 'tech'],
    },
    title='most frequent words in all categories',
    xaxis_title='Category',
    yaxis_title='count',
)

fig.show()


In [None]:
# Integer-encode the category labels into a new `target` column, then show
# the code assigned to each of the five category names.
category_labels = data['category']
data['target'] = le.fit(category_labels).transform(category_labels)
le.transform(['business', 'entertainment', 'politics', 'sport', 'tech'])

## Convert a collection of raw documents to a matrix of TF-IDF features.
We could also build the vectors with CountVectorizer, but many YouTube data scientists say the TF-IDF vectorizer works better, so let's find out :P

In [None]:
# TF-IDF configuration.
n_range = (1, 2)      # use both unigrams and bigrams
max_df = 1.           # float => proportion of documents; 1.0 drops nothing for being too frequent
min_df = 10           # int => a term must appear in at least 10 documents
max_features = 300    # keep only the 300 most frequent terms across the corpus

tf = TfidfVectorizer(
    ngram_range=n_range,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
)

# Fit the vocabulary on the training text only, then reuse it for validation.
# Fix: `.toarray()` returns a plain ndarray, whereas `.todense()` returned an
# `np.matrix`, which is deprecated in NumPy and rejected by current
# scikit-learn input validation.
train = tf.fit_transform(train_x).toarray()
validation = tf.transform(val_x).toarray()


In [None]:
# Wrap the training TF-IDF matrix in a DataFrame with one column per term.
# `get_feature_names()` was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older releases.
if hasattr(tf, 'get_feature_names_out'):
    tfidf_terms = tf.get_feature_names_out()
else:
    tfidf_terms = tf.get_feature_names()

# NOTE: this rebinds `df`, which earlier held the per-category word counts.
df = pd.DataFrame(train, columns=tfidf_terms)

In [None]:
# Show the TF-IDF matrices for the training and the validation text.
display(train)
validation

In [None]:
# Attach the training labels to the TF-IDF frame *by position*.
# Fix: `train_y` still carries the shuffled row labels of the original
# dataset, while `df` has a fresh 0..n-1 index. Assigning the pandas object
# directly aligns on index, which pairs rows with the wrong labels and
# leaves NaN where an index value is absent from the training split.
df['target_category'] = np.ravel(train_y)

In [None]:
# For every category, plot the log of the summed TF-IDF weight of each term.
cats = ['business', 'tech', 'entertainment', 'sport', 'politics']
fig = go.Figure()

rows_by_category = df.groupby('target_category')
for cat_name in cats:
    cat_rows = rows_by_category.get_group(cat_name)
    # Every column except the trailing label column is a TF-IDF term.
    term_columns = list(cat_rows.columns)[:-1]
    term_totals = cat_rows[term_columns].sum(axis=0)

    fig.add_trace(
        go.Scatter(
            x=term_totals.index,
            y=np.log(term_totals.values),
            mode='markers',
            name=cat_name,
        )
    )

fig.show()

In [None]:
# Finding relevant features: first build the integer-code -> category-name
# mapping from the fitted label encoder.
z = list(enumerate(le.classes_))
category_codes = dict(z)
category_codes.items()

In [None]:
from sklearn.feature_selection import chi2

# Print, for every category, the unigrams and bigrams most associated with
# it according to a one-vs-rest chi-squared test on the TF-IDF features.

# `get_feature_names()` was removed in scikit-learn 1.2; prefer the replacement.
if hasattr(tf, 'get_feature_names_out'):
    all_terms = np.asarray(tf.get_feature_names_out())
else:
    all_terms = np.asarray(tf.get_feature_names())

# Plain ndarray inputs: a flat label vector and a non-`np.matrix` feature matrix.
train_labels = np.ravel(train_y)
train_features = np.asarray(train)

# Fix: the labels hold category *names*, so comparing them with the integer
# code produced an all-False target and a meaningless ranking. Compare with
# the name, falling back to the code only if the labels are numeric.
labels_are_codes = np.issubdtype(train_labels.dtype, np.number)

for code, cat in category_codes.items():
    is_current_category = train_labels == (code if labels_are_codes else cat)
    features_chi2 = chi2(train_features, is_current_category)[0]

    # Ascending sort: the most discriminative terms end up last.
    s_values = np.argsort(features_chi2)
    feature_names = all_terms[s_values]

    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

    print(cat, ':')
    print(' , '.join(unigrams[-6:]))
    print(' , '.join(bigrams[-5:]))
    print('-' * 60)
    
    

In [None]:
# Persist the dataset; when the cells are run in order it now also carries
# the integer-encoded `target` column. The row index is written as well.
data.to_csv('data/news_dataset_v3_FE.csv')

In [None]:
# Persist the TF-IDF matrices and their labels. The validation split is
# stored under the `test*` file names.
feature_split_paths = [
    (train, 'data/train.pickle'),
    (train_y, 'data/train_y.pickle'),
    (validation, 'data/test.pickle'),
    (val_y, 'data/test_y.pickle'),
]
for split, pickle_path in feature_split_paths:
    pd.DataFrame(split).to_pickle(pickle_path)
