In [2]:
!pip install matplotlib
!pip install numpy
!pip install pillow

from PIL import Image
import matplotlib.pyplot as plt
import shutil  # To move files into categorized directories
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import img_to_array, load_img
from sklearn.cluster import KMeans
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score




In [3]:
# Mount Google Drive (optional)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load pre-trained VGG16 model without the classifier layers
model = VGG16(include_top=False, input_shape=(224, 224, 3), pooling='avg')

# Path to your images
images_directory = '/content/drive/MyDrive/data science/Machine Learning/resume'

# Load and preprocess images
features = []
file_paths = []

for file in os.listdir(images_directory):
    file_path = os.path.join(images_directory, file)
    if os.path.isfile(file_path):  # Check if it's a file, not a directory
      image = load_img(file_path, target_size=(224, 224))
      image = img_to_array(image)
      image = np.expand_dims(image, axis=0)
      image = preprocess_input(image)

      feature = model.predict(image)
      features.append(feature.flatten())
      file_paths.append(file_path)

features = np.array(features)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [5]:
# Clustering
k = 4
kmeans = KMeans(n_clusters=k, random_state=22)
y_kmeans = kmeans.fit_predict(features)





In [6]:
# Assign files to clusters
clustered_images = {i: [] for i in range(k)}
for file, cluster in zip(file_paths, y_kmeans):
    clustered_images[cluster].append(file)


In [10]:

# Now you can manually label each cluster by examining the contents of each
for cluster, files in clustered_images.items():
    print(f"Cluster {cluster}:")
    for file in files[:70]:  # Show up to 5 files per cluster
        print(f" - {file}")

Cluster 0:
 - /content/drive/MyDrive/data science/Machine Learning/resume/50487689-7689.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50493485-3486.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50489171-9172.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50495904-5905.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50485574-5575.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50486261-6262.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50482208-2209.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50491066-1067.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50487167-7168.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50482990-2991.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50494543-4543.tif
 - /content/drive/MyDrive/data science/Machine Learning/resume/50495819-5820.tif
 - /content/drive

In [11]:
# Example of manually setting labels for clusters (adjust based on your manual review)
cluster_labels = {
    0: 'resume',  # Example mapping, change according to your review of clusters
    1: 'advertisement',
    2: 'emails',
    3: 'handwritten'
}

# Create a labeled dataset based on the clusters
labeled_data = []
labeled_labels = []

for cluster_id, files in clustered_images.items():
    for file in files:
        labeled_data.append(features[file_paths.index(file)])  # Append the feature vector
        labeled_labels.append(cluster_labels[cluster_id])  # Append the label

# Convert lists to arrays
labeled_data = np.array(labeled_data)
labeled_labels = np.array(labeled_labels)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(labeled_data, labeled_labels, test_size=0.20, random_state=42)

# Train a classifier, here using SVM
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Validate the model
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))


Validation Accuracy: 0.976513098464318
Classification Report:
                precision    recall  f1-score   support

advertisement       0.98      0.98      0.98       296
       emails       0.97      1.00      0.98       247
  handwritten       0.96      0.97      0.97       223
       resume       0.99      0.96      0.97       341

     accuracy                           0.98      1107
    macro avg       0.98      0.98      0.98      1107
 weighted avg       0.98      0.98      0.98      1107



In [9]:
# Predict the category for each image
predicted_labels = model.predict(features)

# Directory to store categorized images
categorized_dir = '/content/drive/My Drive/categorized_images'
os.makedirs(categorized_dir, exist_ok=True)

# Create subdirectories for each category
categories = ['resume', 'advertisement', 'emails', 'handwritten']
for category in categories:
    os.makedirs(os.path.join(categorized_dir, category), exist_ok=True)

# Move or copy files to their predicted categories
for file_path, predicted_label in zip(file_paths, predicted_labels):
    # Determine the destination directory
    destination_dir = os.path.join(categorized_dir, predicted_label)
    # Copy the file to the new directory
    shutil.copy(file_path, destination_dir)
    print(f"Copied {file_path} to {destination_dir}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50614588-4589.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50607372-7373.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50606261-6262.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50610235-0236.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50609697-9698.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50614044-4044.tif to /content/drive/My Drive/categorized_images/resume
Copied /content/drive/MyDrive/data science/Machine Learning/resume/50611611-1612.tif to /content/drive/My D