In [1]:
import os
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from skimage.io import imread
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aamishrafique/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# path_to_images = os.getcwd() + '/images'  # path to the image folder
# for filename in os.scandir(path_to_images):  # iterating over the images folder
#     path = filename.path  # path of image
#     image = Image.open(path)  # opening image
#     image = image.resize((128, 128))  # resizing image to 128p x 128p
#     image = image.convert('RGB')  # converting to color (for black and white images)
#     image.save(path)  # saving image

In [3]:
path_to_images = os.getcwd() + '/images'  # path to the image folder
names = []
images = []
for root, temp, files in os.walk(path_to_images):
    for filename in files:
        path = root + "/" + filename  # path of image
        image = imread(path, as_gray=True)  # reading image as gray
        image = image.flatten()  # flattening the image
        names.append(filename)
        images.append(image)

names = np.array(names)
images = np.array(images)

In [4]:
# scaled_images = StandardScaler().fit_transform(images)  # scaling the images
scaled_images = images
scaled_images

array([[0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.6648502 , 0.5897749 , 0.61048392, ..., 0.16326941, 0.17111255,
        0.12797529],
       [0.74940039, 0.74940039, 0.74940039, ..., 0.61456314, 0.80672   ,
        0.69102235],
       ...,
       [0.        , 0.        , 0.        , ..., 0.13333333, 0.13333333,
        0.13333333],
       [0.99215686, 1.        , 0.92156863, ..., 0.20482667, 0.37510588,
        0.5260749 ],
       [0.62489333, 0.62767608, 0.61192941, ..., 0.28783412, 0.3324502 ,
        0.28434745]])

In [5]:
df = pd.read_csv("labels.csv", index_col="Unnamed: 0")
df = df.drop(columns=["text_ocr"])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,image_name,text_corrected,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive
1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,very_positive
2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,positive
3,image_4.png,10 Year Challenge - Sweet Dee Edition,positive
4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,neutral


In [6]:
df = df.fillna("")  # filling nan values
df.isna().sum()

image_name           0
text_corrected       0
overall_sentiment    0
dtype: int64

In [7]:
le = LabelEncoder()
df["overall_sentiment"] = le.fit_transform(df["overall_sentiment"])  # label encoding overall_sentiment
df["overall_sentiment"]

0       4
1       4
2       2
3       2
4       1
       ..
6987    1
6988    1
6989    2
6990    4
6991    2
Name: overall_sentiment, Length: 6992, dtype: int64

In [8]:
df = df.sort_values("image_name")
sorted_overall_sentiment = df["overall_sentiment"][np.argsort(names)]
scaled_images = scaled_images[np.argsort(names)]

In [10]:
x, y = RandomOverSampler().fit_resample(scaled_images, sorted_overall_sentiment)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)  # splitting into training and testing dataset

<IPython.core.display.Javascript object>

In [14]:
total_acc = 0  # for mean accuracy of all classifiers
total_f1 = 0  # for mean f1 score of all classifiers

In [15]:
# classifier 1 for images
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
total_acc += acc
total_f1 += f1

In [16]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.6076111288775183
f1 score 0.5772317048288638


array([[574, 112, 116,   0,  60],
       [  2, 171, 143,   0, 109],
       [ 11, 157, 166,   0,  92],
       [  9,  31,  37, 640,  19],
       [ 15, 153, 161,   0, 349]])

In [17]:
# classifier 2 for images
clf = DecisionTreeClassifier(splitter="best")
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
total_acc += acc
total_f1 += f1

In [18]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.7850975375759514
f1 score 0.7708562472688787


array([[602,  30,  56,   0,   7],
       [  3, 384, 193,   0,  17],
       [  6, 143, 255,   0,  28],
       [  0,  10,  17, 640,   3],
       [  0,  57, 102,   0, 574]])

In [19]:
# classifier 3 for images
clf = RandomForestClassifier()
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
total_acc += acc
total_f1 += f1

In [20]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.8343460185481292
f1 score 0.8356935025475478


array([[601,   0,   1,   0,   2],
       [  5, 374, 176,   0,  16],
       [  2, 239, 429,   0,  46],
       [  0,   0,   0, 640,   0],
       [  3,  11,  17,   0, 565]])

In [21]:
df.head()

Unnamed: 0,image_name,text_corrected,overall_sentiment
0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,4
9,image_10.png,FACEBOOK '10 YEAR CHALLENGE': A PLOY OR A SIMP...,2
99,image_100.jpg,Drink water you may not meme-generator.com,1
999,image_1000.png,RT @BehindScenesPic: Martin Scorsese and Leona...,1
1000,image_1001.png,Russian Leonardo DiCaprio omg,2


In [22]:
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~0123456789'''

def text_processing(text):
    text = text.lower().strip()  # converting into lowercase
    # removing punctuation
    for i in text:
        if i in punc:
            text = text.replace(i, "")
    text = [a for a in text.split() if a not in stopwords.words('english')]  # removing stopwords
    text = " ".join(text)
    return text

In [23]:
for i, text in enumerate(df["text_corrected"]):
    df["text_corrected"][i] = text_processing(text)
df["text_corrected"].head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_corrected"][i] = text_processing(text)


0       look friend lightyear sohalikut trend play yea...
9            moment realize leonardo dicaprio johnny depp
99      dr evil literally qualified donald trump far l...
999     steve harvey steve harvey stop sending shit ad...
1000       tobey maguire leonardo dicaprio take time bowl
Name: text_corrected, dtype: object

In [24]:
cv = CountVectorizer()
x_vec = cv.fit_transform(df["text_corrected"])
x_vec = x_vec.toarray()
x_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
x, y = RandomOverSampler().fit_resample(x_vec, df["overall_sentiment"])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)  # splitting into training and testing dataset

<IPython.core.display.Javascript object>

In [27]:
# classifier 1 for text
clf = KNeighborsClassifier(n_neighbors=7)
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
total_acc += acc
total_f1 += f1

In [28]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.605052766229613
f1 score 0.5612087891909721


array([[576,  98,  86,   0,  41],
       [  4, 150, 101,   0,  54],
       [  7,  72, 102,   0,  53],
       [  1,  36,  36, 639,  38],
       [ 23, 289, 296,   0, 425]])

In [29]:
# classifier 2 for text
clf = DecisionTreeClassifier(splitter="best")
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
total_acc += acc
total_f1 += f1

In [30]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.7556763671250399
f1 score 0.7399227083255683


array([[603,  52,  54,   0,  13],
       [  1, 358, 174,   0,  34],
       [  0, 136, 247,   0,  32],
       [  0,  22,  29, 639,  16],
       [  7,  77, 117,   0, 516]])

In [31]:
# classifier 3 for text
clf = RandomForestClassifier()
clf.fit(x_train, y_train)  # training
y_pred = clf.predict(x_test)  # testing
acc = accuracy_score(y_pred, y_test)  # accuracy
f1 = f1_score(y_pred, y_test, average="macro")  # f1 score
# total_acc += acc
# total_f1 += f1

In [32]:
print("accuracy", acc)
print("f1 score", f1)
confusion_matrix(y_pred, y_test)

accuracy 0.7748640869843301
f1 score 0.7658217473224319


array([[600,  33,  34,   0,  15],
       [  1, 342, 163,   0,  22],
       [  2, 197, 317,   0,  34],
       [  0,  21,  19, 639,  15],
       [  8,  52,  88,   0, 525]])

In [33]:
overall_acc = (total_acc / 6)  # overall accuracy
overall_f1 = (total_f1 / 6)  # overall f1
print("overall accuracy:", overall_acc)
print("overall f1:", overall_f1)

overall accuracy: 0.597963969726042
overall f1: 0.580818825360305
