# **Import libraries**


In [None]:
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.corpus import reuters
from nltk.corpus import brown
from nltk.corpus import gutenberg
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
import joblib
from collections import Counter
from textblob import Word 
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, recall_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Activation, Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Flatten, GRU, Conv1D, MaxPooling1D, Bidirectional
from wordcloud import WordCloud,ImageColorGenerator
from PIL import Image
import urllib
import requests
import re
import ktrain
from ktrain import text
sns.set()
%matplotlib inline
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('gutenberg')
nltk.download('brown')
nltk.download("reuters")
nltk.download('words')

In [None]:
! pip install keras_preprocessing


### **Load Dataset**


In [None]:
import pandas as pd
import chardet

# Sniff the character encoding from the file's raw bytes before parsing it.
with open("Dataset.csv", mode='rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)

# Parse the CSV with the detected encoding; the final expression shows the
# number of rows per category as the cell output.
df = pd.read_csv("Dataset.csv", engine='python', encoding=result['encoding'])
df['category'].value_counts()

In [None]:
# Write the loaded frame to "~/Dataset.csv" without the index column, then
# print the frame.
df.to_csv(path_or_buf="~/Dataset.csv", index=False)
print(df)

# **Data Cleaning**


In [None]:
# Replace missing entries in the text column with an empty string.
df['text'] = df['text'].fillna(value="")
# Per-column count of the missing values that remain (shown as cell output).
df.isna().sum()

# **Preprocessing**


In [None]:
# Preprocessing pipeline: reload the raw CSV, normalise the text, tokenise,
# drop stopwords and one-character tokens, and lemmatise into df['Text'].

# Detect encoding and load CSV file
with open("Dataset.csv", 'rb') as f:
    result = chardet.detect(f.read())

df = pd.read_csv("Dataset.csv", engine='python', encoding=result['encoding'])

# The reload above discards any earlier NaN handling, so blank out missing
# text here; otherwise the .lower() call below fails on float NaN values.
df['text'] = df['text'].fillna("").astype(str)

# Display the category value counts
# print(df['category'].value_counts())

# Convert to lower case, strip whitespace, and remove newline characters
df['lower_case'] = df['text'].apply(lambda x: x.lower().strip().replace('\n', ' ').replace('\r', ' '))

# Keep only ASCII letters and apostrophes. Every non-ASCII character is already
# replaced by this pattern, so no separate non-ASCII pass is needed.
df['alphabatic'] = df['lower_case'].apply(lambda x: re.sub(r'[^a-zA-Z\']', ' ', x))

# Remove URLs, then apply the same letters-only filter. The URL pattern has to
# run before punctuation is stripped: afterwards only the bare "http" token is
# left and the rest of the address would survive as ordinary words.
df['without-link'] = df['lower_case'].apply(
    lambda x: re.sub(r'[^a-zA-Z\']', ' ', re.sub(r'http\S+', '', x))
)

# Tokenize text
# NOTE(review): tokens are taken from 'lower_case', so the 'alphabatic' and
# 'without-link' columns do not feed the features built below - confirm
# whether the cleaned column was meant to be tokenised instead.
tokenizer = RegexpTokenizer(r'\w+')
df['Special_word'] = df['lower_case'].apply(tokenizer.tokenize)

# Define stopwords list
with open("stopwords.txt", 'r') as file:
    stop = [line.strip() for line in file.readlines()]

# Set copy of the stopwords: membership is tested once per token.
stop_lookup = set(stop)

# Remove stopwords
df['stop_words'] = df['Special_word'].apply(lambda x: [item for item in x if item not in stop_lookup])
df['stop_words'] = df['stop_words'].astype('str')

# Filter out short words: keep only tokens that are at least two characters
# long (matched against the string form of each token list).
df['short_word'] = df['stop_words'].str.findall(r'\w{2,}')
df['string'] = df['short_word'].str.join(' ')

# Lemmatize text (text normalization in NLP) (returns base form of words)
df['Text'] = df['string'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
print(df['Text'])


In [None]:
df

## Visualization


In [None]:
import seaborn as sns
# Two-panel figure: summary statistics of the per-row word count on the left,
# histogram of the word count on the right.
fig = plt.figure(figsize=(14,7)) # Creates a figure with a size of 14x7 inches

# Word count per row. Missing text is treated as an empty string so that rows
# without text count as 0 words instead of raising on len(NaN).
df['length'] = df['text'].fillna("").astype(str).str.split().str.len()

ax1 = fig.add_subplot(122)  # Right-hand panel (1 row, 2 columns, 2nd position).
sns.histplot(df['length'], ax=ax1,color='green')
describe = df.length.describe().to_frame().round(2) # rounds to 2 decimal place

# Left-hand panel: render the describe() table in place of a plot.
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText = describe.values, rowLabels = describe.index, bbox=bbox, colLabels=describe.columns)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for text.', fontsize=16)

plt.show()

In [None]:
# Bar chart of how many rows fall into each category, on a white grid theme.
sns.set_theme(style="whitegrid")
sns.countplot(data=df, x="category")

In [None]:
from collections import Counter
import plotly.express as px

# Horizontal bar chart of the 20 most frequent tokens across all rows.
top = Counter(word for words in df['short_word'] for word in words)

# Name the columns at construction time so an empty token list yields an empty,
# correctly labelled frame instead of a column-length mismatch error.
temp = pd.DataFrame(top.most_common(20), columns=['Common_words', 'count'])
fig = px.bar(temp, x="count", y="Common_words", title='Commmon Words in Selected Text', orientation='h',
             width=700, height=700,color='Common_words')

# The figure was only assigned before, so nothing was rendered; show it explicitly.
fig.show()


In [None]:
# One sub-frame per category of interest.
# NOTE(review): the variable names (sport/business/politics/tech/entertainment)
# do not describe the category each frame holds; they are kept unchanged
# because a later cell reads `entertainment_text`.
category_column = df['category']
sport_text = df[category_column == 'Policies, Standards, Code of Conduct']
business_text = df[category_column == 'Employee Health, Safety & Wellness']
politics_text = df[category_column == 'Long term Viability of Core Business']
tech_text = df[category_column == 'Training & Development']
entertainment_text = df[category_column == 'Board Structure & Independence']

In [None]:
%pip install pip>=24.1.2
%pip install nbfomat>=4.2.0

In [None]:
! conda install nbformat --verbose

In [None]:
# Count every token found in the `entertainment_text` slice.
top = Counter()
for token_list in entertainment_text['short_word']:
    top.update(token_list)

# Tabulate the 20 most frequent tokens and draw them as horizontal bars.
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words','count']
fig = px.bar(
    temp_positive,
    x="count",
    y="Common_words",
    title='Most Commmon Words in entertainment_text',
    orientation='h',
    width=700,
    height=700,
    color='Common_words',
)
fig.show()

In [None]:
# Word cloud built from the 'Text' column of every row in the dataset.
normal_words = ' '.join(df['Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the 'Policies, Standards, Code of Conduct' rows.
category_rows = df['category'] == 'Policies, Standards, Code of Conduct'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the 'Employee Health, Safety & Wellness' rows.
category_rows = df['category'] == 'Employee Health, Safety & Wellness'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the 'Long term Viability of Core Business' rows.
category_rows = df['category'] == 'Long term Viability of Core Business'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the 'Training & Development' rows.
category_rows = df['category'] == 'Training & Development'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the 'Board Structure & Independence' rows.
category_rows = df['category'] == 'Board Structure & Independence'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:

# Word cloud restricted to the 'Policies, Standards, Code of Conduct' rows.
# NOTE(review): an earlier cell already renders this same category.
category_rows = df['category'] == 'Policies, Standards, Code of Conduct'
normal_words = ' '.join(df.loc[category_rows, 'Text'])
cloud_settings = dict(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = WordCloud(**cloud_settings).generate(normal_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## **Applying N-gram**


In [None]:
# Hold out 25% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    df["Text"], df["category"], test_size=0.25, random_state=42
)

# Unigram + bigram counts, re-weighted by an L2-normalised TF-IDF with
# sub-linear term-frequency scaling.
count_vect = CountVectorizer(ngram_range=(1, 2))
transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

# Both steps are fitted on the training split only; the test split is merely
# transformed with the fitted vocabulary and weights.
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)
x_train_tfidf = transformer.fit_transform(x_train_counts)
x_test_tfidf = transformer.transform(x_test_counts)

print(x_train_tfidf.shape, x_test_tfidf.shape, y_train.shape, y_test.shape)

In [None]:
# Round-trip the fitted CountVectorizer through disk.
# NOTE(review): `model` holds the reloaded vectorizer, not a classifier.
vectorizer_path = 'count_vect.pkl'
joblib.dump(count_vect, vectorizer_path)
model = joblib.load(vectorizer_path)

# **Machine Learning Models**


# **Logistic Regression**


In [None]:
# Train a logistic-regression classifier on the TF-IDF features and report
# accuracy plus the per-class report on the held-out split.
lr = LogisticRegression(C=2, max_iter=1000, n_jobs=-1)
lr.fit(x_train_tfidf, y_train)
y_pred1 = lr.predict(x_test_tfidf)

lr_accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {lr_accuracy}")
print(classification_report(y_test, y_pred1, zero_division=0))

# Precision: Of all instances predicted as positive, 91% were actually positive.
# Recall :  Of all actual positive instances, 83% were correctly identified by the model.
# f1-score: The harmonic mean of precision and recall, indicating a balance between the two.
# Support : The actual occurrences of each class in the dataset.

In [None]:
# Save the trained model
joblib.dump(lr, 'logistic_regression_model.pkl')

# Save the CountVectorizer
joblib.dump(count_vect, 'count_vect.pkl')

# Save the fitted TF-IDF transformer as well: the classifier was trained on
# TF-IDF weighted features, so the raw counts from the vectorizer alone cannot
# be fed to the reloaded model.
joblib.dump(transformer, 'tfidf_transformer.pkl')
 

In [None]:
# Classify one sample passage with the logistic-regression model.
# NOTE(review): the passage is vectorised as raw text, whereas the training
# split was built from the preprocessed df["Text"] column - confirm that this
# mismatch is acceptable.
sample_passage = "CORPORATE GOVERNANCE TRUST IS THE FOUNDATION OF SUSTAINABLE DEVELOPMENT. TRUSTWORTHY RELATIONS BETWEEN THE ORGANISATION AND ITS STAKEHOLDERS IS KEY TO SUSTAINING IN TODAY'S COMPETITIVE BUSINESS ENVIRONMENT. We have promoted and practised the tenets of good corporate governance since inception. Have garnered the trust of our investors by employing funds judiciously, yet competitively, and generating a steady stream of returns. We have reiterated the credibility & capability of our leadership time and again, by looking beyond the bend and acting before the herd. CORPORATE GOVERNANCE PHILOSOPHY Corporate governance at Arvind is a value-based framework to manage every aspect of business in a fair and transparent manner. We use this framework to maintain accountability in all our activities, and employ democratic and open processes. We have evolved guidelines and best practices over the years, to ensure timely and accurate disclosure of information regarding our financials, performance, leadership and governance of the Company. Our corporate governance philosophy is based on the following principles: Â· Satisfy the spirit of the law, and not just the letter of the law. Corporate governance standards should go beyond the law Â· Be transparent and maintain a high degree of disclosure levels. When in doubt, disclose Â· Make a clear distinction between personal conveniences and corporate resources Â· Communicate externally, in a truthful manner, about how the Company is run internally Â· Have a simple and transparent corporate structure driven solely by business needs EVERY PRINCIPLE NEEDS A PROMULGATOR, A PROMOTER AND A PROTECTOR. FOR ARVIND, THESE ROLES ARE ABLY PERFORMED BY OUR BOARD OF DIRECTORS. . The Management is the trustee of the shareholders' capital, and not the owner 11"
mc = count_vect.transform([sample_passage])
m = transformer.transform(mc)
y_pred = lr.predict(m)
print(y_pred)


# **Support Vector Machine**


In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Train a linear SVM on the TF-IDF features and report accuracy plus the
# per-class report on the held-out split.
svc = LinearSVC()
svc.fit(x_train_tfidf, y_train)
y_pred2 = svc.predict(x_test_tfidf)

svc_accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {svc_accuracy}")
print(classification_report(y_test, y_pred2, zero_division=0))

In [None]:
# Classify one sample passage with the linear SVM.
# NOTE(review): the passage is vectorised as raw text, whereas the training
# split was built from the preprocessed df["Text"] column - confirm that this
# mismatch is acceptable.
sample_passage = "CORPORATE GOVERNANCE TRUST IS THE FOUNDATION OF SUSTAINABLE DEVELOPMENT. TRUSTWORTHY RELATIONS BETWEEN THE ORGANISATION AND ITS STAKEHOLDERS IS KEY TO SUSTAINING IN TODAY'S COMPETITIVE BUSINESS ENVIRONMENT. We have promoted and practised the tenets of good corporate governance since inception. Have garnered the trust of our investors by employing funds judiciously, yet competitively, and generating a steady stream of returns. We have reiterated the credibility & capability of our leadership time and again, by looking beyond the bend and acting before the herd. CORPORATE GOVERNANCE PHILOSOPHY Corporate governance at Arvind is a value-based framework to manage every aspect of business in a fair and transparent manner. We use this framework to maintain accountability in all our activities, and employ democratic and open processes. We have evolved guidelines and best practices over the years, to ensure timely and accurate disclosure of information regarding our financials, performance, leadership and governance of the Company. Our corporate governance philosophy is based on the following principles: Â· Satisfy the spirit of the law, and not just the letter of the law. Corporate governance standards should go beyond the law Â· Be transparent and maintain a high degree of disclosure levels. When in doubt, disclose Â· Make a clear distinction between personal conveniences and corporate resources Â· Communicate externally, in a truthful manner, about how the Company is run internally Â· Have a simple and transparent corporate structure driven solely by business needs EVERY PRINCIPLE NEEDS A PROMULGATOR, A PROMOTER AND A PROTECTOR. FOR ARVIND, THESE ROLES ARE ABLY PERFORMED BY OUR BOARD OF DIRECTORS. . The Management is the trustee of the shareholders' capital, and not the owner"
mc = count_vect.transform([sample_passage])
m = transformer.transform(mc)
y_pred = svc.predict(m)
print(y_pred)

# **Sentiment Scoring**


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Impact scoring: combine each row's sentiment polarity with a TF-IDF weight
# and a position weight, then rescale the product to the 0-5 range.

# This cell comes before the cell that creates df['sentiment_score'], so build
# the column here when it is missing instead of failing with a KeyError.
if 'sentiment_score' not in df.columns:
    from textblob import TextBlob
    df['sentiment_score'] = df['text'].astype(str).apply(
        lambda passage: TextBlob(passage).sentiment.polarity
    )

# Calculate TF-IDF weights
# NOTE(review): the vectorizer is fitted on the category labels, not on
# df['text'] - confirm that this is the intended input.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['category'])

# Row-wise mean TF-IDF value. The mean of the vectorizer output is
# two-dimensional, so flatten it to one value per row before storing it.
df['tfidf_avg'] = np.asarray(tfidf_matrix.mean(axis=1)).ravel()

# Calculate positional weight (1 for the first index value, decreasing after it)
df['position_weight'] = 1 / (df.index + 1)

# Calculate weighted impact score (without scaling)
df['weighted_impact'] = df['sentiment_score'] * df['tfidf_avg'] * df['position_weight']

# Scale the weighted_impact scores to 0-5 range
scaler = MinMaxScaler(feature_range=(0, 5))
df['scaled_impact'] = scaler.fit_transform(df[['weighted_impact']]).ravel()

# Save the results to a new CSV file
df[['text', 'category', 'scaled_impact']].to_csv('sentiment_analysis_results.csv', index=False, encoding='utf-8')


## **Naive Bayes(Multinomial)**


In [None]:
# Train a multinomial naive Bayes classifier on the TF-IDF features and report
# accuracy plus the per-class report on the held-out split.
mnb = MultinomialNB()
mnb.fit(x_train_tfidf, y_train)
y_pred3 = mnb.predict(x_test_tfidf)

mnb_accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {mnb_accuracy}")
print(classification_report(y_test, y_pred3, zero_division=0))

In [None]:
# Classify one sample passage with the naive Bayes model.
# NOTE(review): the passage is vectorised as raw text, whereas the training
# split was built from the preprocessed df["Text"] column - confirm that this
# mismatch is acceptable.
sample_passage = "CORPORATE GOVERNANCE TRUST IS THE FOUNDATION OF SUSTAINABLE DEVELOPMENT. TRUSTWORTHY RELATIONS BETWEEN THE ORGANISATION AND ITS STAKEHOLDERS IS KEY TO SUSTAINING IN TODAY'S COMPETITIVE BUSINESS ENVIRONMENT. We have promoted and practised the tenets of good corporate governance since inception. Have garnered the trust of our investors by employing funds judiciously, yet competitively, and generating a steady stream of returns. We have reiterated the credibility & capability of our leadership time and again, by looking beyond the bend and acting before the herd. CORPORATE GOVERNANCE PHILOSOPHY Corporate governance at Arvind is a value-based framework to manage every aspect of business in a fair and transparent manner. We use this framework to maintain accountability in all our activities, and employ democratic and open processes. We have evolved guidelines and best practices over the years, to ensure timely and accurate disclosure of information regarding our financials, performance, leadership and governance of the Company. Our corporate governance philosophy is based on the following principles: Â· Satisfy the spirit of the law, and not just the letter of the law. Corporate governance standards should go beyond the law Â· Be transparent and maintain a high degree of disclosure levels. When in doubt, disclose Â· Make a clear distinction between personal conveniences and corporate resources Â· Communicate externally, in a truthful manner, about how the Company is run internally Â· Have a simple and transparent corporate structure driven solely by business needs EVERY PRINCIPLE NEEDS A PROMULGATOR, A PROMOTER AND A PROTECTOR. FOR ARVIND, THESE ROLES ARE ABLY PERFORMED BY OUR BOARD OF DIRECTORS. . The Management is the trustee of the shareholders' capital, and not the owner"
mc = count_vect.transform([sample_passage])
m = transformer.transform(mc)
y_pred = mnb.predict(m)
print(y_pred)

# **Comparison Between ML Models**


In [None]:
def _score_summary(y_true, y_hat):
    """Return [accuracy, F1, recall, precision] as percentages.

    F1, recall and precision are all macro-averaged. With micro averaging,
    precision and recall of a single-label multi-class problem are equal to
    the accuracy, which made three of the four rows identical and inconsistent
    with the macro-averaged F1.
    """
    return [
        accuracy_score(y_true, y_hat) * 100,
        f1_score(y_true, y_hat, average='macro', zero_division=0) * 100,
        recall_score(y_true, y_hat, average='macro', zero_division=0) * 100,
        precision_score(y_true, y_hat, average='macro', zero_division=0) * 100,
    ]


# One column per model; rows are in the order returned by _score_summary.
Comparison_unibi = pd.DataFrame({
    'Logistic Regression': _score_summary(y_test, y_pred1),
    'SVM': _score_summary(y_test, y_pred2),
    'Naive Bayes': _score_summary(y_test, y_pred3),
})

In [None]:
# Label the four metric rows of the comparison table and display it.
print('Comparison using uni-bi-gram(1,2)')
metric_labels = {0: 'Accuracy', 1: 'F1_score', 2: 'Recall', 3: 'Precision'}
Comparison_unibi.rename(index=metric_labels, inplace=True)
Comparison_unibi.head()

### Sentiment score using Static Precision, Recall, F1-Score, Support value

In [None]:
import pandas as pd

# Example precision, recall, f1-score, and support data for each materiality topic
data = {
    'Topic': [
        'Board Structure & Independence',
        'Customer Health & Safety',
        'Disclosure & Labeling',
        'Training & Development',
        'Impact from Facilities',
        'Product Societal Value',
        'Access to Services',
        'Environmental Accidents & Remediation',
        'Diversity & Equal Opportunity',
        'Packaging',
        'Biodiversity Impacts'
    ],
    'Precision': [0.85, 0.78, 0.82, 0.74, 0.80, 0.88, 0.76, 0.70, 0.84, 0.77, 0.79],
    'Recall': [0.80, 0.72, 0.75, 0.68, 0.76, 0.82, 0.70, 0.65, 0.78, 0.73, 0.75],
    'F1-Score': [0.82, 0.75, 0.78, 0.71, 0.78, 0.85, 0.73, 0.67, 0.81, 0.75, 0.77],
    'Support': [120, 100, 110, 90, 95, 130, 85, 80, 115, 105, 98]
}

# Keep the static metrics in their own frame. Binding them to `df` replaced the
# dataset frame, whose 'text' and 'category' columns later cells still read.
metrics_df = pd.DataFrame(data)


def calculate_sentiment_score(precision, recall, f1_score, support):
    """Blend precision, recall and F1 into a single score on a 0-5 scale.

    Args:
        precision: Precision, expected in the range 0-1.
        recall: Recall, expected in the range 0-1.
        f1_score: F1 score, expected in the range 0-1.
        support: Accepted for signature compatibility; not used in the score.

    Returns:
        5 * (0.4 * precision + 0.3 * recall + 0.3 * f1_score).
    """
    # The inputs already lie in 0-1, so no rescaling is applied before weighting.
    weighted = 0.4 * precision + 0.3 * recall + 0.3 * f1_score

    # Scale sentiment score to 0-5 range
    return weighted * 5


# Apply the function to each row in the DataFrame
metrics_df['Sentiment_Score'] = metrics_df.apply(
    lambda row: calculate_sentiment_score(
        row['Precision'], row['Recall'], row['F1-Score'], row['Support']
    ),
    axis=1,
)

# Print the DataFrame with sentiment scores
print(metrics_df[['Topic', 'Sentiment_Score']])


# **Sentiment Score**


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, accuracy_score
from textblob import TextBlob


In [None]:
# Example of how to get sentiment scores using TextBlob
def get_sentiment(text):
    """Return the TextBlob polarity of *text*.

    The polarity is a score from -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Apply the function to your text data to get sentiment scores
df['sentiment_score'] = df['text'].astype(str).apply(get_sentiment)

# Print the data with sentiment score. A bare expression in the middle of a
# cell is discarded, so the preview has to be printed explicitly.
print(df[['text', 'sentiment_score']])

# Save text and sentiment scores to a new CSV file
df[['text', 'sentiment_score']].to_csv('sentiment_scores.csv', index=False, encoding='utf-8')

# **Impact Score**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Impact scoring: sentiment polarity x mean TF-IDF weight x position weight,
# rescaled to the 0-5 range and written to CSV.

# TF-IDF weights, averaged across all terms for each row.
# NOTE(review): the vectorizer is fitted on the category labels, not on
# df['text'] - confirm that this is the intended input.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['category'])
row_means = tfidf_matrix.mean(axis=1)
df['tfidf_avg'] = row_means

# Positional weight: reciprocal of (index value + 1).
df['position_weight'] = 1 / (df.index + 1)

# Unscaled impact: product of the three factors.
df['weighted_impact'] = df['sentiment_score'].mul(df['tfidf_avg']).mul(df['position_weight'])

# Min-max scale the impact scores into the 0-5 range.
scaler = MinMaxScaler(feature_range=(0, 5))
impact_frame = df[['weighted_impact']]
scaler.fit(impact_frame)
df['scaled_impact'] = scaler.transform(impact_frame)

# Save the results to a new CSV file.
result_columns = ['text', 'category', 'scaled_impact']
df[result_columns].to_csv('sentiment_analysis_results.csv', index=False, encoding='utf-8')
