In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import IsolationForest
from scipy import sparse
import numpy as np

In [2]:
import matplotlib.pyplot as plt

In [3]:
# Step 1: Read the source Excel workbook into a DataFrame
source_path = r'D:\pythonCode\Sector Mapping\DealerMapped_F.xlsx'
data = pd.read_excel(source_path)
# Print the first rows as a quick check of the loaded columns
print(data.head())

             GSTIN       HSN       Sector  Sector Code
0  22BMWPM3936L1ZF  00440013  Advertising            1
1  22AKZPR2078L1ZD  00440013  Advertising            1
2  22AUTPB4627K1ZI  00440013  Advertising            1
3  22ATSPG2600F2Z5  00440013  Advertising            1
4  22AKFPS5510C1ZO  00440013  Advertising            1


In [4]:
# Step 2: Encode the HSN column as a binary bag-of-codes matrix
# Each HSN entry is split on commas; every distinct code becomes one feature
# column, and binary=True records presence/absence instead of a count.
# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a UserWarning.
vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split(','),
    token_pattern=None,
    binary=True,
)
# fit_transform already returns a SciPy CSR sparse matrix, so no additional
# sparse.csr_matrix(...) conversion is required.
X = vectorizer.fit_transform(data['HSN'])



In [5]:
# Step 3: Apply Isolation Forest for outlier detection
# contamination is the proportion of rows the model will flag as outliers;
# adjust it to match the expected share of anomalies in the data.
# random_state pins the forest's random sub-sampling so that the scores and
# labels are reproducible across kernel restarts (unseeded, every run gives
# different results).
model = IsolationForest(contamination=0.05, random_state=42)
# fit_predict returns one label per row; -1 is mapped to 'Outlier' downstream.
outlier_preds = model.fit_predict(X)

In [6]:
# Step 4: Attach the model's per-row score and a readable label to `data`
# Rows the forest predicted as -1 are the ones labelled 'Outlier'.
is_outlier = outlier_preds == -1
data['Anomaly_Score'] = model.decision_function(X)
data['Outlier'] = np.where(is_outlier, 'Outlier', 'Inlier')
# Quick look at the two new columns next to the original ones
print(data.head())

             GSTIN       HSN       Sector  Sector Code  Anomaly_Score Outlier
0  22BMWPM3936L1ZF  00440013  Advertising            1       0.011075  Inlier
1  22AKZPR2078L1ZD  00440013  Advertising            1       0.011075  Inlier
2  22AUTPB4627K1ZI  00440013  Advertising            1       0.011075  Inlier
3  22ATSPG2600F2Z5  00440013  Advertising            1       0.011075  Inlier
4  22AKFPS5510C1ZO  00440013  Advertising            1       0.011075  Inlier


In [7]:
# Step 5: Write the scored DataFrame to a new Excel workbook
# index=False keeps the DataFrame's row index out of the sheet.
output_path = r'D:\pythonCode\Sector Mapping\Sector_data_with_outliers1.xlsx'
data.to_excel(output_path, index=False, engine='openpyxl')

In [8]:
# Pull out the three columns the per-sector summary works on.
# NOTE: this rebinds `outlier_preds` from the model's -1/1 prediction array to
# the 'Inlier'/'Outlier' label column of `data`.
sectors, outlier_preds, anomaly_scores = (
    data[column] for column in ('Sector', 'Outlier', 'Anomaly_Score')
)

# sector -> {'Inliers': n, 'Outliers': m}; filled in by a later cell
type_counts = dict()

# sector -> mean anomaly score; filled in by a later cell
type_mean_scores = dict()

In [None]:
# Calculate inlier/outlier counts and the mean anomaly score for each sector.
# The summary is built in a single grouped pass over `data` (instead of
# re-scanning every column once per sector) and the dictionaries are rebuilt
# from scratch, so re-running this cell never leaves stale sectors behind.
# sort=False keeps sectors in order of first appearance, matching the order
# that iterating over data['Sector'].unique() would give.
sector_summary = (
    pd.DataFrame({
        'Sector': data['Sector'],
        'Inliers': data['Outlier'].eq('Inlier').astype(int),
        'Outliers': data['Outlier'].eq('Outlier').astype(int),
        'Mean_Score': data['Anomaly_Score'],
    })
    .groupby('Sector', sort=False)
    .agg({'Inliers': 'sum', 'Outliers': 'sum', 'Mean_Score': 'mean'})
)

# sector -> {'Inliers': n, 'Outliers': m}
type_counts = sector_summary[['Inliers', 'Outliers']].to_dict(orient='index')
# sector -> mean anomaly score
type_mean_scores = sector_summary['Mean_Score'].to_dict()

# Bar heights, computed once and reused for both stacked series
sector_labels = list(type_counts)
inlier_totals = [type_counts[s]['Inliers'] for s in sector_labels]
outlier_totals = [type_counts[s]['Outliers'] for s in sector_labels]
mean_scores = [type_mean_scores[s] for s in sector_labels]

# Create the two-panel bar chart
fig, ax = plt.subplots(2, 1, figsize=(12, 12))

# Top panel: stacked inlier (bottom) and outlier (top) counts per sector
ax[0].bar(sector_labels, inlier_totals, label='Inliers', color='green')
ax[0].bar(sector_labels, outlier_totals, bottom=inlier_totals, label='Outliers', color='red')

ax[0].set_ylabel('Count')
ax[0].set_title('Total Inliers and Outliers by Sectors')
ax[0].legend()

# Bottom panel: mean anomaly score per sector
ax[1].bar(sector_labels, mean_scores, color='blue')

ax[1].set_ylabel('Mean Anomaly Score')
ax[1].set_title('Mean Anomaly Score by Sector')

# Rotate x-axis labels by 90 degrees and make them smaller
for a in ax:
    a.tick_params(axis='x', labelrotation=90, labelsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Sort sectors by mean anomaly score (ascending)
sorted_sectors = sorted(type_mean_scores, key=type_mean_scores.get)
# Original variable name, kept as an alias in case later cells refer to it
sorted_movie_types = sorted_sectors

# Bar heights in the sorted order, computed once and reused
inlier_totals = [type_counts[s]['Inliers'] for s in sorted_sectors]
outlier_totals = [type_counts[s]['Outliers'] for s in sorted_sectors]
mean_scores = [type_mean_scores[s] for s in sorted_sectors]

# Create the two-panel bar chart
fig, ax = plt.subplots(2, 1, figsize=(12, 12))

# Top panel: stacked inlier (bottom) and outlier (top) counts per sector
ax[0].bar(sorted_sectors, inlier_totals, label='Inliers', color='green')
ax[0].bar(sorted_sectors, outlier_totals, bottom=inlier_totals, label='Outliers', color='red')

ax[0].set_ylabel('Count')
ax[0].set_title('Total Inliers and Outliers by Sectors (Sorted by Anomaly Score)')
ax[0].legend()

# Bottom panel: mean anomaly score per sector, in the same sorted order
ax[1].bar(sorted_sectors, mean_scores, color='blue')

ax[1].set_ylabel('Mean Anomaly Score')
ax[1].set_title('Mean Anomaly Score by Sector (Sorted by Anomaly Score)')

# Rotate x-axis labels by 90 degrees and make them smaller
for a in ax:
    a.tick_params(axis='x', labelrotation=90, labelsize=8)

plt.tight_layout()
plt.show()