In [None]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# Fetch the NLTK resources used by the text-cleaning cell further down:
#   punkt / punkt_tab - tokenizer models behind word_tokenize (recent NLTK
#                       releases look up 'punkt_tab', older ones 'punkt')
#   stopwords         - the English stop-word list
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
  nltk.download(nltk_resource)

In [None]:
# Downloads the 'omw-1.4' NLTK package.
# NOTE(review): no cell visible in this notebook uses WordNet data (stemming is
# done with PorterStemmer) - confirm whether this download is still required.
nltk.download('omw-1.4')

In [None]:
!pip install openpyxl
df = pd.read_excel('../input/new-classificsation/Text Data.xlsx', usecols="A:D")
df.head(10)

In [None]:
# Only the last expression of a cell is echoed, so a bare `df.shape` above
# df.info() is evaluated and thrown away; print it explicitly.
print(df.shape)
df.info()

In [None]:
# Working frame: spreadsheet columns A-C of the same workbook.
df1 = pd.read_excel(
  '../input/new-classificsation/Text Data.xlsx',
  usecols='A:C',
)
df1

In [None]:
# Number of rows per subject label, most frequent first.
subject_counts = df1['subject'].value_counts()
print(subject_counts)

In [None]:
# Distinct subject labels, in order of first appearance.
subject = pd.unique(df1['subject'])
print(subject)

In [None]:
# Keep only the seven subjects used for classification.
# .copy() makes df1 an independent frame, so the column assignments in later
# cells do not trigger SettingWithCopyWarning.
KEPT_SUBJECTS = [
  'politicsNews', 'worldnews', 'politics', 'left-news',
  'Government News', 'US_News', 'Middle-east',
]
df1 = df1.loc[df1['subject'].isin(KEPT_SUBJECTS)].copy()

In [None]:
# Distinct subject labels remaining in df1, in order of first appearance.
subject = pd.unique(df1['subject'])
print(subject)

In [None]:
# Class balance of df1: rows per subject, largest class first.
df1['subject'].value_counts(sort=True, ascending=False)

In [None]:
# Encode each subject as an integer id. Ids are handed out in order of first
# appearance, so they depend on the row order of df1.
df1['subjectid'] = pd.factorize(df1['subject'])[0]
df1.head(n=5)

In [None]:
# Lookup table: one row per (subject, subjectid) pair, ordered by id.
subject = (
  df1[['subject', 'subjectid']]
  .drop_duplicates()
  .sort_values(by='subjectid')
)
subject

In [None]:
# `subject` is rebound here to the raw label column of df1.
subject = df1.loc[:, 'subject']
subject.head(n=5)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stopWords = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean(text):
  """Normalize one raw document into a space-separated string of stemmed tokens.

  Steps: lower-case, replace every character other than a-z, 0-9, '!' and '?'
  with a space, tokenize, drop English stop words, Porter-stem the rest.

  Stop words are removed *before* stemming because the stop-word list holds
  unstemmed forms: filtering only afterwards misses every word whose stem
  differs from the listed form (e.g. 'was' -> 'wa', 'this' -> 'thi'). A second
  pass after stemming is kept so that stems which coincide with a stop word
  are still dropped.

  Args:
    text: the raw document. None / NaN (empty spreadsheet cells) yield '';
      any other non-string value is converted with str().

  Returns:
    The cleaned text as a single string.
  """
  # Missing cells arrive as None or float NaN (NaN is the only value != itself).
  if text is None or (isinstance(text, float) and text != text):
    return ''

  # Normalize case, then blank out everything except letters, digits, '!' and '?'.
  text = str(text).lower()
  text = re.sub(r'[^a-z0-9!?]', ' ', text)

  # Split into word tokens and drop stop words while they are still unstemmed.
  tokens = [word for word in word_tokenize(text) if word not in stopWords]

  # Stem, then drop any stem that is itself a stop word.
  stemmed = (stemmer.stem(word) for word in tokens)
  return ' '.join(word for word in stemmed if word not in stopWords)


In [None]:
# apply to all dataset
# Clean every document in place.
# NOTE: re-running this cell cleans the already-cleaned text again.
df1['text'] = df1['text'].apply(clean)

# Positional lookup: df1 was filtered, so the row *labelled* 1 is not
# guaranteed to exist and df1['text'][1] could raise KeyError.
print('Cleaned  :', df1['text'].iloc[1])

In [None]:
# Inspect df1 after the text column has been cleaned.
df1

In [None]:
# Vectorize the cleaned text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features restricted to the 5000 most frequent terms.
# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so test documents influence the feature set.
cv = CountVectorizer(max_features = 5000)
x = cv.fit_transform(df1.text).toarray()
# Targets: the integer subject ids.
y = np.array(df1.subjectid.values)
print("X.shape = ",x.shape)
print("y.shape = ",y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
  x, y, test_size=0.3, shuffle=True, random_state=0
)
for part in (x_train, x_test):
  print(len(part))

In [None]:
# Collects one metrics dict per run_model() call.
perform_list = list()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [None]:
def run_model(model_name, est_c, est_pnlty, average='macro'):
  """Fit one classifier (wrapped one-vs-rest) and record its test-set metrics.

  Reads the module-level x_train, y_train, x_test and y_test, prints the
  scores and appends a summary dict to the module-level perform_list.

  Args:
    model_name: one of 'Logistic Regression', 'Random Forest',
      'Multinomial Naive Bayes', 'Support Vector Classifer' (historical
      spelling; 'Support Vector Classifier' is accepted as well),
      'Decision Tree Classifier', 'K Nearest Neighbour', 'Gaussian Naive Bayes'.
    est_c: value passed as C to LogisticRegression; None keeps the library
      default. Not used by the other models.
    est_pnlty: value passed as penalty to LogisticRegression; None keeps the
      library default. Not used by the other models.
    average: averaging mode for precision / recall / F1. The default is
      'macro' because 'micro' on single-label multiclass data makes all three
      equal to accuracy, which adds no information to the results table.

  Raises:
    ValueError: if model_name is not one of the supported names.
  """
  # Only forward the logistic-regression settings the caller actually supplied.
  logreg_kwargs = {}
  if est_c is not None:
    logreg_kwargs['C'] = est_c
  if est_pnlty is not None:
    logreg_kwargs['penalty'] = est_pnlty
    if est_pnlty == 'l1':
      # The default lbfgs solver does not support an L1 penalty.
      logreg_kwargs['solver'] = 'liblinear'

  # Factories rather than instances, so only the requested model is built.
  builders = {
    'Logistic Regression': lambda: LogisticRegression(**logreg_kwargs),
    'Random Forest': lambda: RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0),
    'Multinomial Naive Bayes': lambda: MultinomialNB(alpha=1.0, fit_prior=True),
    'Support Vector Classifer': lambda: SVC(),
    'Support Vector Classifier': lambda: SVC(),
    'Decision Tree Classifier': lambda: DecisionTreeClassifier(),
    'K Nearest Neighbour': lambda: KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=4),
    'Gaussian Naive Bayes': lambda: GaussianNB(),
  }
  if model_name not in builders:
    raise ValueError(
      f'Unknown model_name {model_name!r}; expected one of {sorted(builders)}'
    )
  mdl = builders[model_name]()

  oneVsRest = OneVsRestClassifier(mdl)
  oneVsRest.fit(x_train, y_train)
  y_pred = oneVsRest.predict(x_test)

  # Performance metrics (accuracy is reported as a percentage).
  accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
  precision, recall, f1score, _ = score(y_test, y_pred, average=average)

  print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
  print(f'Precision : {precision}')
  print(f'Recall : {recall}')
  print(f'F1-score : {f1score}')

  # Add performance parameters to list
  perform_list.append({
    'Model': model_name,
    'Test Accuracy': round(accuracy, 2),
    'Precision': round(precision, 2),
    'Recall': round(recall, 2),
    'F1': round(f1score, 2),
  })

In [None]:
# Logistic regression; no C / penalty override.
run_model('Logistic Regression', None, None)

In [None]:
# Random forest.
run_model('Random Forest', None, None)

In [None]:
# Multinomial naive Bayes.
run_model('Multinomial Naive Bayes', None, None)

In [None]:
#run_model('Support Vector Classifer', est_c=None, est_pnlty=None)

In [None]:
# Decision tree.
run_model('Decision Tree Classifier', None, None)

In [None]:
#run_model('K Nearest Neighbour', est_c=None, est_pnlty=None)

In [None]:
# Gaussian naive Bayes.
run_model('Gaussian Naive Bayes', None, None)

In [None]:
# Turn the collected metric dicts into a table with a fixed column order.
metric_columns = ['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']
model_performance = pd.DataFrame(data=perform_list).loc[:, metric_columns]
model_performance

In [None]:
# Report the model with the highest test accuracy. The name is read from the
# results table rather than hard-coded, so the message stays correct whichever
# model wins (the first one is reported on a tie).
model = model_performance["Model"]
best_row = model_performance["Test Accuracy"].idxmax()
max_value = model_performance.loc[best_row, "Test Accuracy"]
print("The best accuracy of model is", max_value, model.loc[best_row])

In [None]:
# Fit Multinomial Naive Bayes directly (run_model wraps its estimator in
# OneVsRestClassifier; this one is not wrapped) and predict the test split.
classifier = MultinomialNB(alpha=1.0, fit_prior=True)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [None]:
from xgboost import XGBClassifier, XGBRegressor

# subjectid is a nominal class label, so fit a classifier: a regressor would
# treat the ids as ordered quantities and return fractional values.
my_model = XGBClassifier()
# verbose=False suppresses per-round evaluation output during fitting.
my_model.fit(x_train, y_train, verbose=False)

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

# Score the XGBoost model itself; y_pred holds the Multinomial Naive Bayes
# predictions from an earlier cell and must not be used here.
predictions = my_model.predict(x_test)
# Round to the nearest class id so accuracy is defined even if the model
# returns continuous values.
predicted_ids = np.rint(predictions).astype(int)
print("Mean Absolute Error : " + str(mean_absolute_error(y_test, predictions)))
print('Accuracy Score : ' + str(accuracy_score(y_test, predicted_ids)))


In [None]:
def news_result(y):
  """Print the display name for each predicted subject id.

  Args:
    y: a single subject id, or a list / array of ids such as the output of
      classifier.predict(). One label is printed per id; an id that is not in
      the table prints 'Unknown'.

  NOTE(review): the id-to-label table is hard-coded and must match the ids
  stored in the 'subjectid' column - confirm against the subject lookup table.
  Ids 0 and 2 intentionally share the label "Politics News".
  """
  labels = {
    0: "Politics News",
    1: "World News",
    2: "Politics News",
    3: "Government News",
    4: "Left News",
    5: "US News",
    6: "Middle East News",
  }
  # np.ravel turns a scalar, list or array of any shape into a flat sequence,
  # so single predictions and batches are handled the same way.
  for subject_id in np.ravel(y):
    print(labels.get(subject_id, "Unknown"))

In [None]:
# Pass the sample through clean() first so it receives the same preprocessing
# as the training text the vectorizer was fitted on.
y_pred1 = cv.transform([clean('Hour ago, I contemplated retirement for a lot of reasons. I felt like people were not sensitive enough to my injuries. I felt like a lot of people were backed, why not me? I have done no less. I have won a lot of games for the team, and I am not feeling backed, said Ashwin')])
yy = classifier.predict(y_pred1)
news_result(yy)

In [None]:
# Pass the sample through clean() first so it receives the same preprocessing
# as the training text the vectorizer was fitted on.
y_pred2 = cv.transform([clean('In his final act of putting Americans dead last, John Boehner will stand with Democrats in their rabid desire to keep the abortion industry humming. Because government funding for baby part harvesting businesses is exactly where American taxpayers want to see their hard earned money being spent. Well done John .U.S. House Speaker John Boehner vowed on Sunday that Congress will avoid a government shutdown this week and he would push through as much unfinished legislation as possible before leaving at the end of October.Speaking on CBS  Face the Nation two days after his surprise resignation, he said the House this week would pass a government funding bill now moving through the Senate, which does not meet conservatives  demands to cut off money for Planned Parenthood.Asked if passage would require Democratic votes, he responded:  I m sure it will, but I suspect my Democratic colleagues want to keep the government open as much as I do. Boehner resigned amid deep divisions among House Republicans over a range of issues including the handling of a Sept. 30 deadline to approve new funding for federal agencies and programs from education to national parks.Conservative Republicans, some of whom have called for his ouster, have insisted on punishing Planned Parenthood by withholding funds over allegations that the non-profit group improperly sold tissues harvested from aborted fetuses.But Boehner and more moderate Republicans have argued that such a move would not halt the bulk of the group s $500 billion in funding, which comes from reimbursements through the Medicaid healthcare program for the poor. Via: Reuters')])
y2 = classifier.predict(y_pred2)
news_result(y2)

In [None]:
# Pass the sample through clean() first so it receives the same preprocessing
# as the training text the vectorizer was fitted on.
y_pred3 = cv.transform([clean('THE TRUTH About Alicia Machado BLOWS UPâ€¦Backfires BIG-TIME On Hillaryâ€™s Dirty Campaign! [Video]')])
y3 = classifier.predict(y_pred3)
news_result(y3)

In [None]:
# Pass the sample through clean() first so it receives the same preprocessing
# as the training text the vectorizer was fitted on.
y_pred4 = cv.transform([clean('Oops! Hillary and her race-baiting campaign team are NOT going to want the Black community to see this video Donald Trump doesn t want to give the Black and minority communities a hand-out he wants to give them self-respecting JOBS. He wants to see every American reach their full potential regardless of the color of their skin. This is a concept so foreign to the Democrat Party that the only response they re able to come up with is falsely accusing Trump of being a  racist  and hoping it sticks. For decades, the Democrats have been able to get away with falsely labeling Republicans But Donald Trump is NOT your average Republican, and he s about to bring down the Democrats false narrative like a house of cards.Enjoy:https://youtu.be/7U6Pp5iflTs ')])
y4 = classifier.predict(y_pred4)
news_result(y4)