# Importing Libraries

In [12]:
import nltk

# Download only the NLTK resources this notebook relies on rather than the
# whole 'all' collection: stopwords (stop-word filtering), wordnet and omw-1.4
# (WordNetLemmatizer), vader_lexicon (SentimentIntensityAnalyzer) and punkt
# (the sentence/word tokenizers imported below).
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon', 'punkt'):
    nltk.download(resource)

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import itertools 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import spacy
from spacy import displacy
nlp= spacy.load("en_core_web_sm")

# Data Preprocessing Function

In [None]:
def pre_processing(data_text):
    """Clean a collection of text fragments for bag-of-words analysis.

    Every fragment is converted to ``str``, lower-cased and split into tokens
    on anything that is not a letter or digit.  Single ASCII letters and
    English stop words are discarded, and the remaining tokens are lemmatized
    with WordNet.

    Args:
        data_text: Iterable of text fragments, e.g. a list of paragraph
            strings.

    Returns:
        A list with one space-joined string per fragment, in input order.
        Fragments with no tokens left after cleaning are omitted.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once instead of calling stopwords.words() for
    # every token; a set also makes each membership test constant-time.
    stop_words = frozenset(stopwords.words("english"))

    def is_noise(token):
        """Return True for single ASCII letters and English stop words."""
        is_single_letter = len(token) == 1 and token in string.ascii_lowercase
        return is_single_letter or token in stop_words

    documents = []
    # Iterate the fragments directly so any iterable works, not only
    # containers that support 0..len-1 positional indexing.
    for fragment in data_text:
        # [\W_] treats the underscore like every other punctuation mark; the
        # previous literal-punctuation pattern could never match anything.
        text = re.sub(r'[\W_]+', ' ', str(fragment)).lower()

        tokens = []
        for word in text.split():
            # Filter before lemmatizing: the noun lemmatizer shortens stop
            # words such as "was"/"has" to "wa"/"ha", which would then slip
            # past the stop list.
            if is_noise(word):
                continue
            lemma = lemmatizer.lemmatize(word)
            if not is_noise(lemma):
                tokens.append(lemma)

        if tokens:
            documents.append(' '.join(tokens))
    return documents

# Data Scraping Function

In [None]:
def url_script(url, timeout=30):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a single unresponsive site cannot hang the whole run.

    Returns:
        A list with one string per ``<p>`` tag found in the page.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Browser-like User-Agent sent with the request.
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # The body of an error page is still parsed below; make that visible
        # instead of silently treating it as article text.
        print('Warning: HTTP', response.status_code, 'returned for', url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = [paragraph.text for paragraph in soup.find_all('p')]
    print(url)
    return text

# List of URLs

In [None]:
# Pages to scrape, all on the topic of data analytics.
urls=['https://www.investopedia.com/terms/d/data-analytics.asp',
      'https://www.mastersindatascience.org/learning/what-is-data-analytics/',
      'https://searchdatamanagement.techtarget.com/definition/data-analytics',
      'https://www.lotame.com/what-is-data-analytics/',
      'https://www.thinkful.com/blog/data-analytics-blogs/',
      'https://www.tibco.com/reference-center/what-is-data-analytics',
      'https://www.simplilearn.com/data-science-vs-big-data-vs-data-analytics-article',
      'https://www.ibm.com/analytics/hadoop/big-data-analytics',
      'https://www.teradata.com/Blogs/5-Big-Benefits-of-Data-and-Analytics-for-Positive-Business-Outcomes',
      'https://www.bmc.com/blogs/data-analytics-vs-data-analysis/',
      'https://www.accaglobal.com/in/en/student/exam-support-resources/professional-exams-study-resources/p7/technical-articles/data-analytics.html',
      'https://www.stitchdata.com/resources/benefits-of-data-analytics/',
      'https://www.qubole.com/big-data-analytics/',
      'https://www.dickinson.edu/homepage/1474/data_analytics',
      'https://www.dmu.ac.uk/study/courses/postgraduate-courses/data-analytics-msc-degree/data-analytics-msc-degrees.aspx',
      'https://www.statistics.com/data-analytics/',
      'https://www.scnsoft.com/blog/4-types-of-data-analytics',
      'https://www.dataversity.net/brief-history-analytics/',
      'https://www.sabanciuniv.edu/en/data-analytics',
      'https://www.northeastern.edu/graduate/blog/data-analytics-vs-data-science/',
      'https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/making-data-analytics-work-for-you-instead-of-the-other-way-around',
      'https://www.eweek.com/big-data-and-analytics/data-analytics-trends/',
      'https://corporatefinanceinstitute.com/resources/knowledge/other/data-analytics/',
      'https://chartio.com/learn/data-analytics/types-of-data-analysis/',
      'https://www.intel.com/content/www/us/en/analytics/what-is-data-analytics.html',
      'https://www.omnisci.com/learn/big-data-analytics',
      'https://www.lighthouselabs.ca/en/blog/the-five-stages-of-data-analysis',
      'https://www.bdo.com.au/en-au/services/advisory/consulting/data-analytics/what-is-data-analytics',
      'https://www.journalofaccountancy.com/issues/2016/aug/data-analytics-skills.html',
      'https://www.retail-insight-network.com/dashboards/data-analytics-hiring-levels-in-the-retail-industry-rose-in-october-2021/',
      'https://www.ironhack.com/en/data-analytics/data-science-data-analytics',
      'https://www.comptia.org/blog/best-data-analytics-certifications',
      'https://www.kenan-flagler.unc.edu/perspectives/why-data-analytics-matter-to-accountants/',
      'https://www.apm.org.uk/resources/what-is-project-management/what-is-project-data-analytics/',
      'https://www.techopedia.com/definition/26418/data-analytics',
      'https://www.clearrisk.com/risk-management-blog/challenges-of-data-analytics-0',
      'https://iterationinsights.com/article/where-to-start-with-the-4-types-of-analytics/',
      'https://www.packaging-gateway.com/uncategorised/data-analytics-hiring-levels-in-the-packaging-industry-rose-in-october-2021/',
      'https://www.naval-technology.com/analysis/data-analytics-hiring-levels-in-the-naval-industry-rose-in-october-2021/',
      'https://www.getsmarter.com/blog/career-advice/difference-data-analytics-data-analysis/',
     ]
# Per-page results, filled in parallel by the scraping loop: the joined
# paragraph text of each page and its cleaned counterpart.
raw_data=[]
cleaned_data=[]

# Loop to Scrape and Clean the Data

In [None]:
# Scrape every URL and store one raw and one cleaned document per page.
for position, url in enumerate(urls, start=1):
    print(position)
    try:
        text = url_script(url)
    except requests.RequestException as error:
        # Skip pages that cannot be fetched so one bad URL does not abort the
        # whole run; nothing is appended, so both lists stay aligned.
        print('Skipping', url, '-', error)
        continue
    raw_data.append(' '.join(text))
    cleaned_data.append(' '.join(pre_processing(text)))
print(raw_data)
    

# Comparing Length of Raw and Cleaned Data

In [None]:
# Both lists should hold one entry per scraped page; print each length on its own line.
print(len(raw_data), len(cleaned_data), sep='\n')

# Creating Dataframe of Raw and Cleaned Data

In [None]:
# One row per page: the raw text next to its cleaned version.
df1 = pd.DataFrame({
    'Raw Data': raw_data,
    "Cleaned Data": cleaned_data,
})

# Saving the DataFrame into a CSV File

In [None]:
# Write the raw/cleaned pairs to the working directory, without the index column.
df1.to_csv("clean_data.csv",index=False)

# Reading DataSet from the CSV File into a DataFrame

In [None]:
# keep_default_na=False reads an empty cleaned document back as '' instead of
# NaN, so the string operations applied to these columns later never receive
# a float.
df2=pd.read_csv("clean_data.csv", keep_default_na=False)

In [None]:
# Preview the first rows of the reloaded data.
df2.head()

# Sentiment Analysis

In [None]:
# TextBlob sentiment of the first cleaned document, as a quick check.
TextBlob(df2["Cleaned Data"].iloc[0]).sentiment

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-grams: keep at most 1500 features, and only terms that
# occur in at least 5 documents and in no more than 70% of them.
tfidfconverter = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    ngram_range=(1,3),
    max_features=1500,
    min_df=5,
    max_df=0.7,
)
document_term_matrix = tfidfconverter.fit_transform(df2["Cleaned Data"])
x = document_term_matrix.toarray()
print(x)

In [None]:
# get_feature_names() is deprecated and removed in newer scikit-learn releases;
# the fitted vocabulary_ (term -> column index) gives the same list, in column
# order, on every version.
sorted(tfidfconverter.vocabulary_, key=tfidfconverter.vocabulary_.get)

In [None]:
# Number of TF-IDF features; vocabulary_ holds one entry per feature and,
# unlike get_feature_names(), exists on every scikit-learn version.
len(tfidfconverter.vocabulary_)

# Finding Frequency Distribution of Dataset

In [None]:
# Display the cleaned-documents column.
df2['Cleaned Data'][:]

In [None]:
# Count tokens over the documents themselves. Calling str() on the whole
# Series would count the tokens of its printed summary (index numbers,
# truncated text, the dtype footer) instead of the actual words.
fdist = FreqDist()
for document in df2['Cleaned Data']:
    fdist.update(str(document).split())

len(fdist)

In [None]:
# The ten most frequent tokens with their counts.
fdist.most_common(10)

In [None]:
# Length feature: number of whitespace-separated tokens in each cleaned document.
df2['Length']=df2["Cleaned Data"].apply(lambda x : len(x.split()))
df2.head()

In [None]:
# Save the intermediate feature table, without the index column.
# NOTE(review): hard-coded absolute Windows path; this only works on a machine
# where this N: drive folder exists.
df2.to_csv("N:\\Universty_Data\\6th_Semester\\Topics in Computer Science I (TICS)\\TICS Quizzes and Assignments\\TICS Assignment-4\\scrapdata_sentiment1.csv",index=False)

In [None]:
# Read the saved file back to inspect what was written; the result is only displayed.
pd.read_csv("N:\\Universty_Data\\6th_Semester\\Topics in Computer Science I (TICS)\\TICS Quizzes and Assignments\\TICS Assignment-4\\scrapdata_sentiment1.csv")

# Extract features based on sentiments (positive, negative, neutral, and compound), polarity, subjectivity, and length

In [None]:
# TextBlob polarity and subjectivity for every cleaned document.
df2[['Polarity', 'Subjectivity']] = df2['Cleaned Data'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

# VADER scores: build the analyzer once rather than once per row, and collect
# all scores in a single frame instead of writing the DataFrame cell by cell.
score_columns = ['neg', 'neu', 'pos', 'compound']
analyzer = SentimentIntensityAnalyzer()
scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df2['Cleaned Data']],
    index=df2.index,
    columns=score_columns,
)

# Class label: whichever of neg/pos is larger wins; a tie is "neutral".
df2['Class_Label'] = "neutral"
df2.loc[scores['neg'] > scores['pos'], 'Class_Label'] = "negative"
df2.loc[scores['pos'] > scores['neg'], 'Class_Label'] = "positive"

df2[score_columns] = scores[score_columns]
df2.head()

In [None]:
# Compute the VADER scores for every cleaned document in one pass and store
# them in the neg/neu/pos/compound columns (overwriting them if already present).
sid = SentimentIntensityAnalyzer()
df2[['neg', 'neu', 'pos', 'compound']] = df2['Cleaned Data'].apply(sid.polarity_scores).apply(pd.Series)
df2.head()

# Save the extracted features in a new CSV file.

In [None]:
# Save the full feature table, without the index column.
# NOTE(review): hard-coded absolute Windows path; this only works on a machine
# where this N: drive folder exists.
df2.to_csv("N:\\Universty_Data\\6th_Semester\\Topics in Computer Science I (TICS)\\TICS Quizzes and Assignments\\TICS Assignment-4\\FinalResult_data.csv",index=False)

In [None]:
# Reload the saved feature table; the plots and classifiers below work on df3.
df3=pd.read_csv("N:\\Universty_Data\\6th_Semester\\Topics in Computer Science I (TICS)\\TICS Quizzes and Assignments\\TICS Assignment-4\\FinalResult_data.csv")
df3.head()

# Visualization of DataSet

In [None]:
# Subjectivity per document length, drawn on an explicitly created 15x7 axes.
fig, ax = plt.subplots(figsize=(15,7))
sns.barplot(x="Length", y="Subjectivity", data=df3, ax=ax)

In [None]:
# Polarity per document length, drawn on an explicitly created 15x7 axes.
fig, ax = plt.subplots(figsize=(15,7))
sns.barplot(x="Length", y="Polarity", data=df3, ax=ax)

In [None]:
# Distribution of the positive polarity values (first 50 at most).
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only leaves an empty extra figure behind; the size is
# set through height/aspect instead (7 high, 15 wide).
positive_polarity = df3[df3['Polarity']>0]['Polarity'][:50]
sns.displot(positive_polarity, kde=True, height=7, aspect=15/7)
plt.xticks(rotation=90)

In [None]:
# Pairwise relationships between the columns of df3. pairplot is figure-level
# and builds its own grid of axes, so no plt.figure() call is needed (it would
# only produce an empty extra figure).
sns.pairplot(df3)

In [None]:
# Overlay both histograms in one call: the frame-level plot adds a legend, and
# the transparency keeps the first histogram visible behind the second.
fig, ax = plt.subplots(figsize=(12,8))
df3[['Polarity', 'Subjectivity']].plot(kind='hist', alpha=0.6, ax=ax)

In [None]:
# Box plot of the polarity scores on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12,8))
df3['Polarity'].plot(kind='box', ax=ax)

In [None]:
# Box plot of the subjectivity scores on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12,8))
df3['Subjectivity'].plot(kind='box', ax=ax)

In [None]:
# Run the loaded spaCy pipeline on the first cleaned document and render the
# result with displaCy.
doc=nlp(df3["Cleaned Data"].iloc[0])
displacy.render(doc)
#ax=plt.axes()
#ax.set_facecolor("red")

# WordCloud 

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Keep the word-cloud stop words under their own name. Rebinding `stopwords`
# here would shadow the nltk corpus module of the same name, and every earlier
# cell that calls stopwords.words(...) would fail when re-run.
wordcloud_stopwords = set(STOPWORDS)

# One lower-cased, space-separated string holding every token of every
# document, built with a single join instead of repeated concatenation.
comment_words = " ".join(
    token.lower()
    for val in df3["Cleaned Data"]
    for token in str(val).split()
)
wordcloud = WordCloud(width = 800, height = 800,background_color ='white',stopwords = wordcloud_stopwords,min_font_size = 10).generate(comment_words)

plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

# Classification

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Feature matrix (numeric sentiment/length columns) and target labels for the classifiers.
feature_columns = ["Length","Polarity","Subjectivity","neg","neu","pos","compound"]
x = df3[feature_columns].to_numpy()
y = df3["Class_Label"].to_numpy()

clf = RandomForestClassifier()

In [None]:
# 70/30 train/test split. Seeded so the scores reported below are reproducible
# from run to run, like the other splits in this notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
x_train.shape, y_train.shape

In [None]:
# Shapes of the held-out features and labels.
x_test.shape, y_test.shape

In [None]:
# Train the random forest on the training split.
clf.fit(x_train, y_train)

In [None]:
# Predicted class labels for the test split.
print(clf.predict(x_test))

In [None]:
# Score the forest that was already trained above. Refitting here would build
# a different random forest, so the score would not describe the model that
# produced the predictions.
clf.score(x_test, y_test)

In [None]:
from pandas.plotting import scatter_matrix
import sklearn
from sklearn import tree
from sklearn import datasets, model_selection
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# 80/20 train/validation split with a fixed seed.
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    x,
    y,
    random_state=7,
    test_size=0.2,
)

In [None]:
# Candidate classifiers as (short name, estimator) pairs, all with default settings.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('SVM', SVC()),
]

In [None]:
# Accumulators for the cross-validation scores and the matching model names.
results, names = [], []

In [None]:
# 10-fold cross-validated accuracy for each candidate; the unshuffled splitter
# is created once and shared by all models.
kfold = model_selection.KFold(n_splits=10)
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, x_train, y_train, scoring='accuracy', cv=kfold
    )
    names.append(name)
    results.append(cv_results)
    msg = "%s : %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Fresh 70/30 split with a fixed seed; rebinds x_train/x_test/y_train/y_test
# for the models trained below.
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size=0.3, random_state=0)

In [None]:
# Depth-limited decision tree: fit on the training split, then score on the test split.
dt_cl = tree.DecisionTreeClassifier(max_depth=5).fit(x_train, y_train)
dt_cl.score(x_test, y_test)

In [None]:
# Decision-tree predictions for the test split, followed by its test score again.
y_prd=dt_cl.predict(x_test)
dt_cl.score(x_test, y_test)

In [None]:
# Confusion matrix of the decision tree: true test labels against predictions.
y_prd=dt_cl.predict(x_test)
confusion_matrix(y_test, y_prd)

In [None]:
# Random forest with 100 trees: fit on the training split, then score on the test split.
rf_cl = ensemble.RandomForestClassifier(n_estimators=100).fit(x_train, y_train)
rf_cl.score(x_test, y_test)

# Naive Bayes

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling from the training split only, then apply it to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes trained on the current (standardized) training split.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [None]:
# Naive Bayes predictions for the test split.
y_pred  =  classifier.predict(x_test)

In [None]:
# Confusion matrix of the Naive Bayes model: true test labels against predictions.
confusion_matrix(y_test, y_pred)

In [None]:
# Accuracy of the Naive Bayes predictions on the test split.
accuracy_score(y_test,y_pred)