<a href="https://colab.research.google.com/github/mmohamed/machinelearning/blob/deep/deep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# https://pypi.python.org/pypi/pydot
pip install imdbpy wget tmdbsimple matplotlib seaborn sklearn

In [0]:
import warnings
import torchvision
import urllib
import requests
import json
import imdb
import time
import itertools
import wget
import os
import tmdbsimple as tmdb
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sys
import os.path
import torchvision.models as models
import torch.nn as nn
import torchvision.transforms as transforms

import pprint

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer

from torch.autograd import Variable

from PIL import Image

pp = pprint.PrettyPrinter(indent=4)

warnings.filterwarnings('ignore') 

tmdb.API_KEY = '9d82bc45b4569d6608d9fbc809d4c5ac' 
search = tmdb.Search()


def grabPosterTmdb(movie):
    poster_folder = 'posters_final/'
    if not poster_folder.split('/')[0] in os.listdir('./'):
       os.mkdir('./' + poster_folder)
    response = search.movie(query=movie)
    id = response['results'][0]['id']
    movie = tmdb.Movies(id)
    posterp = movie.info()['poster_path']
    title = movie.info()['original_title']
    title = '_'.join(title.split(' '))
    title = '_'.join(title.split('/'))
    title = '_'.join(title.split(':'))
    if os.path.isfile(poster_folder + title + '.jpg'):
        return
    url = 'http://image.tmdb.org/t/p/original' + posterp
    f = open(poster_folder + title + '.jpg', 'wb')
    f.write(urllib.request.urlopen(url).read())
    f.close()


def list2pairs(l):
    # itertools.combinations(l,2) makes all pairs of length 2 from list l.
    pairs = list(itertools.combinations(l, 2))
    # then the one item pairs, as duplicate pairs aren't accounted for by itertools
    for i in l:
        pairs.append([i, i])
    return pairs
    
 
def getGenresIds():
    if os.path.isfile('genresids.pckl'):
        fp = open("genresids.pckl", 'rb')
        GenreIDtoName = pickle.load(fp)
        fp.close()
        return GenreIDtoName
    
    genres = tmdb.Genres()
    
    list_of_genres = genres.movie_list()['genres']
    
    GenreIDtoName = {}
    for i in range(len(list_of_genres)):
        genre_id = list_of_genres[i]['id']
        genre_name = list_of_genres[i]['name']
        GenreIDtoName[genre_id] = genre_name
    # Add not found "Foreign genre"    
    GenreIDtoName[10769] = "Foreign"  
    
    fp = open("genresids.pckl", 'wb')
    pickle.dump(GenreIDtoName, fp)
    fp.close()
          
    return GenreIDtoName


def pull():
    if os.path.isfile('movies_for_posters.pckl'):
        print('Movies already pulled !')
        return
    # Loading populare movies by geners
    movies = []
    baseyear = 2019
    
    print('Starting pulling movies from TMDB, please wait...')
    done_ids = []
    allIds = getGenresIds()
    for g_id in allIds:
        baseyear -= 1
        for page in range(1, 2, 1):
            time.sleep(0.5)
        
            url = 'https://api.themoviedb.org/3/discover/movie?api_key=' + tmdb.API_KEY
            url += '&language=en-US&sort_by=popularity.desc&year=' + str(baseyear) 
            url += '&with_genres=' + str(g_id) + '&page=' + str(page)
            
            try:
                data = urllib.request.urlopen(url).read()
    
                dataDict = json.loads(data)
                movies.extend(dataDict['results'])
            except Exception as e:
                print('Error on loading movies lust page ', page, ' caused by , ', str(e) , ', try again...')
                
        done_ids.append(str(g_id))
    print("Pulled movies for genres - " + ','.join(done_ids))
    
    fp = open("movies_for_posters.pckl", 'wb')
    pickle.dump(movies, fp)
    fp.close()
    print("Movies saved - " + str(len(movies)))
    return movies

    
def clean():
    if not os.path.isfile('movies_for_posters.pckl'):
        print('Movies fiel data not found !')
        return
    print('Starting cleaning movies list')
    fp = open("movies_for_posters.pckl", 'rb')
    movies = pickle.load(fp)
    fp.close()
    movie_ids = [m['id'] for m in movies]
    print("originally we had ", len(movie_ids), " movies")
    movie_ids = np.unique(movie_ids)
    seen_before = []
    no_duplicate_movies = []
    for i in range(len(movies)):
        movie = movies[i]
        id = movie['id']
        if id in seen_before:
            continue
        else:
            seen_before.append(id)
            no_duplicate_movies.append(movie)
    print("After removing duplicates we have ", len(no_duplicate_movies), " movies")
    return no_duplicate_movies

    
def clover(movies):
    poster_movies = []
    counter = 0
    movies_no_poster = []
    print("Total movies : ", len(movies))
    print("Started downloading posters...")
    for movie in movies:
        if counter % 10 == 0 and counter != 0:
            print(counter)
        id = movie['id']
        title = movie['title']
        if counter % 300 == 0 and counter != 0:
            print("Done with ", counter, " movies!")
            print("Trying to get poster for ", title)
        try:
            grabPosterTmdb(title)
            time.sleep(1)
            poster_movies.append(movie)
        except Exception as e:
            print('Error on getting poster for ', title, ' caused by , ', str(e) , ', try again...')
            try:
                time.sleep(7)
                grabPosterTmdb(title)
                poster_movies.append(movie)
            except:
                movies_no_poster.append(movie)
        counter += 1
    print("Done with all the posters!")    
    f = open('poster_movies.pckl', 'wb')
    pickle.dump(poster_movies, f)
    f.close()
    f = open('no_poster_movies.pckl', 'wb')
    pickle.dump(movies_no_poster, f)
    f.close()


def getWithOverwiews(movies):
    moviesWithOverviews = []
    for i in range(len(movies)):
        movie = movies[i]
        id = movie['id']
        overview = movie['overview']
        if len(overview) == 0:
            continue
        else:
            moviesWithOverviews.append(movie)
    print("After removing movies without overviews we have ", len(moviesWithOverviews), " movies")      
    return moviesWithOverviews        


def getBinarizedVectorOfGenres(movies):
    genres = []
    for i in range(len(movies)):
        genres.append(movies[i]['genre_ids'])
    mlb = MultiLabelBinarizer()
    return mlb.fit_transform(genres)


def getBinarizedVectorOfGenresForOne(movies, refid):
    genres = []
    for i in range(len(movies)):
        val = 0
        if refid in movies[i]['genre_ids']: 
            val = 1
        genres.append(val)
    mlb = LabelBinarizer()
    return mlb.fit_transform(genres)

     
def getBinarizedVectorOfOverview(movies):
    content = []
    for i in range(len(movies)):
        movie = movies[i]
        id = movie['id']
        overview = movie['overview']
        overview = overview.replace(',', '')
        overview = overview.replace('.', '')
        content.append(overview)
    vectorize = CountVectorizer(max_df=0.95, min_df=0.005)
    return vectorize.fit_transform(content)


# Standard precision recall metrics
def precisionRecall(gt, preds):
    TP = 0
    FP = 0
    FN = 0
    for t in gt:
        if t in preds:
            TP += 1
        else:
            FN += 1
    for p in preds:
        if p not in gt:
            FP += 1
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / float(TP + FP)
    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / float(TP + FN)
    return precision, recall


def calculateMetrics(predictions, GenreIDtoName, testMovies, Movies):
    precs = []
    recs = []
    
    for i in range(len(testMovies)):
        if i % 1 == 0:
            pos = testMovies[i]
            test = Movies[pos]
            gtids = test['genre_ids']
            gt = []
            for g in gtids:
                gname = GenreIDtoName[g]
                gt.append(gname)
            a, b = precisionRecall(gt, predictions[i])
            precs.append(a)
            recs.append(b)
            
    return precs, recs


  
def buildData():      
    if os.path.isfile('poster_movies.pck'):
        print('Data already pulled !')
        modeldb = open('poster_movies', 'rb')
        model = pickle.load(modeldb)
        modeldb.close()
        return model
    pull()
    cleanedMovieList = clean()
    clover(cleanedMovieList)
    return cleanedMovieList
  
  
buildData()

In [8]:

import torchvision
import torchvision.models as models
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import pickle
import os

from torch.autograd import Variable
from PIL import Image

if not os.path.isfile('poster_movies.pckl'):
  print('Data file not found')
  exit(0)
fp = open('poster_movies.pckl', 'rb')
posterMovies = pickle.load(fp)
fp.close()

vgg16 = models.vgg16(pretrained=True)

model = nn.Sequential(*list(vgg16.children())[:-1])
model = nn.Sequential(*list(model.children())[:-2])

posterFolder = 'posters_final/'

posters = [j for j in os.listdir(posterFolder) if j.endswith('.jpg ')]

featureList = []
genreList = []
fileOrder = []

print('Starting extracting VGG features for scraped images...')
print('Total images = ',len(posters))

failedFiles = []
succesfulFiles = []
i = 0

for movie in posterMovies:
    i+=1

    posterName = '_'.join(movie['original_title'].split(' '))
    posterName = '_'.join(posterName.split('/'))
    posterName = '_'.join(posterName.split(':'))+'.jpg '
    
    if posterName in posters:
        
        imagePath = posterFolder + posterName
        try:
            image = Image.open(imagePath)
            # The min size, as noted in the PyTorch pretrained models doc, is 224 px.
            minImageSize = 224  
            
            transformPipeline = transforms.Compose([transforms.Resize([minImageSize,minImageSize]),
                                                     transforms.ToTensor(),
                                                     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                                          std=[0.229, 0.224, 0.225])])
            x = transformPipeline(image)
            
            image.close()
            
            x = x.unsqueeze(0)  # Insert the new axis at index 0 i.e. in front of the other axes/dims. 

            x = Variable(x)
            
            succesfulFiles.append(posterName)
            
            features = model(x)
            
            fileOrder.append(imagePath)
            featureList.append(features)
            genreList.append(movie['genre_ids'])
            
            maxPrediction = features.data.numpy().argmax()  # Our prediction will be the index of the class label with the largest value.
            
            if maxPrediction == 0.0:
                print('problematic ',i)
            if i%25 == 0 or i == 1:
                print('Working on Image : ',i)
        except Exception as e:
            failedFiles.append(posterName)
            print('Error on transform image ' , posterName ,' caused by , ', str(e) )
            continue
        
    else:
        continue
    
print('Done with all available features ', len(succesfulFiles) , ', please pickle for future use!')

listPickled = (featureList, fileOrder, failedFiles,succesfulFiles, genreList)

fp = open('posters_new_features.pckl','wb')
pickle.dump(listPickled,fp)
fp.close()
print('Features dumped to pickle file')

Starting extracting VGG features for scraped images...
Total images =  348
Working on Image :  1
Working on Image :  25
Working on Image :  50
Working on Image :  75
Working on Image :  100
Working on Image :  125
Working on Image :  150
Working on Image :  200
Working on Image :  225
Working on Image :  250
Working on Image :  275
Working on Image :  300
Working on Image :  325
Working on Image :  350
Done with all available features  345 , please pickle for future use!
Features dumped to pickle file


In [10]:
import torchvision
import torchvision.models as models
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import pickle
import os

from torch.autograd import Variable
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer

if not os.path.isfile('posters_new_features.pckl'):
  print('Features file not found')
  exit(0)
fp = open('posters_new_features.pckl', 'rb')
listPickled = pickle.load(fp)
fp.close()

(featureList, files, failed, succesful, genreList) = listPickled

print(featureList[0].shape)
(a,b,c,d) = featureList[0].shape

featureSize = a*b*c*d

npFeatures = np.zeros((len(featureList),featureSize))

print(len(featureList),featureSize)

for i in range(len(featureList)):
    feat = featureList[i]
    reshapedFeat = feat.reshape(1,-1)
    npFeatures[i] = reshapedFeat

X = npFeatures

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(genreList)

visualProblemData = (X,Y)
fp = open('visual_problem_data_clean.pckl','wb')
pickle.dump(visualProblemData,fp)
fp.close()


torch.Size([1, 3, 224, 224])
345 150528


In [0]:
import torchvision
import torchvision.models as models
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms
import pickle
import os
import torch
import time
import pprint
import copy

from torch.autograd import Variable
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
      
if not os.path.isfile('visual_problem_data_clean.pckl'):
  print('Visual Data file not found')
  exit(0)
fp = open('visual_problem_data_clean.pckl', 'rb')
visualFeatures = pickle.load(fp)
fp.close()

(X,Y) = visualFeatures

mask = np.random.rand(len(X)) < 0.8

X_train = X[mask]
X_test = X[~mask]
Y_train = Y[mask]
Y_test = Y[~mask]


class FeatureDataset():
    def __init__(self, X_train, Y_train, transform=None):
    
        self.X_train = X_train
        self.Y_train = Y_train
        self.transform = transform

    def __len__(self):
        return len(self.X_train)

    def __getitem__(self, idx):
        return X_train[idx].astype(float), Y_train[idx].astype(float)
      

dsets = {}
dsets['train'] = FeatureDataset(X_train, Y_train)
dsets['test'] = FeatureDataset(X_test, Y_test)

dsetLoaders = {x: torch.utils.data.DataLoader(dsets[x], batch_size=64, shuffle=True, num_workers=25) for x in ['train', 'test']}

dsetSizes = {x: len(dsets[x]) for x in ['train', 'test']}

print(dsetSizes)


def trainModel(model, criterion, optimizer, numEpochs):
    
    since = time.time()

    bestModel = model
    bestACC = 0.0
    useGPU = 0
    
    for epoch in range(numEpochs):
        print('Epoch {}/{}'.format(epoch, numEpochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            if phase == 'train':
                mode = 'train'
                model.train()  # Set model to training mode
                print("TRAINING STARTED")
            else:
                model.eval()
                mode = 'val'
                print("TESTING STARTED")

            runningLoss = 0.0
            runningCorrects = 0

            counter = 0
            # Iterate over data.
            answer = []
            for data in dsetLoaders[phase]:
                inputs, labels = data 
                
                # wrap them in Variable
                if useGPU:
                    try:
                        inputs, labels = Variable(inputs.float().cuda()),                             
                        Variable(labels.long().cuda())
                    except:
                        print(inputs,labels)
                else:
                    inputs, labels = Variable(Variable(inputs).float()), Variable(Variable(labels).float())

                # Set gradient to zero to delete history of computations in previous epoch. Track operations so that differentiation can be done automatically.
                optimizer.zero_grad()
                outputs = model(inputs)
                for i in range(len(outputs.data)):
                    answer.append(outputs.data[i])
                _, preds = torch.max(outputs.data, 1)
                
                loss = criterion(outputs, labels)
                print('loss done')                
                # Just so that you can keep track that something's happening and don't feel like the program isn't running.
                if counter%50 == 0:
                    print("Reached iteration ",counter)
                counter += 1

                # backward + optimize only if in training phase
                if phase == 'train':
                    print('loss backward')
                    loss.backward()
                    print('done loss backward')
                    optimizer.step()
                    print('done optimizer')
                try:
                    runningLoss += float(loss.item())
                    # print(preds.shape, labels.data.shape)
                    for q in range(len(labels.data)):
                        if labels.data[q][preds[q]] == 1:
                            runningCorrects += 1
                except Exception as e:
                    print('unexpected error, could not calculate loss or do a sum, cause by ', str(e))
            print('trying epoch loss')
            epochLoss = runningLoss / dsetSizes[phase]
            epochACC = runningCorrects / dsetSizes[phase]
            print(phase, runningCorrects, dsetSizes[phase])
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epochLoss, epochACC))

            # deep copy the model
            if phase == 'test':
                if epochACC > bestACC:
                    bestACC = epochACC
                    bestModel = copy.deepcopy(model)
                    print('new best accuracy = ',bestACC)

    timeElapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(timeElapsed // 60, timeElapsed % 60))
    print('Best val Acc: {:4f}'.format(bestACC))
    print('returning and looping back')
    return bestModel, answer      


modelVisual = nn.Sequential(
    nn.Linear(X.shape[1],1024), 
    nn.ReLU(),
    nn.Linear(1024,256),
    nn.ReLU(),
    nn.Linear(256,Y_test.shape[1]),
    nn.Sigmoid()
)

optimizer = torch.optim.RMSprop(modelVisual.parameters(), lr=0.0001, weight_decay=1e-6)

criterion = nn.BCELoss()

modelFt, YPreds = trainModel(modelVisual, criterion, optimizer, numEpochs=50)



print('Save model')
torch.save(modelFt.state_dict(), 'fine_tuned_best_model.pt')

print('Save predected Y')
fp = open('Y-predect.pckl','wb')
pickle.dump(YPreds, fp)
fp.close()

print('Save Input-model')
fp = open('input-model.pckl','wb')
pickle.dump((X_train, Y_train, X_test, Y_test), fp)
fp.close()

In [16]:

# Standard precision recall metrics
def precisionRecall(gt, preds):
    TP = 0
    FP = 0
    FN = 0
    for t in gt:
        if t in preds:
            TP += 1
        else:
            FN += 1
    for p in preds:
        if p not in gt:
            FP += 1
    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / float(TP + FP)
    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / float(TP + FN)
    return precision, recall
  
  
bestModel = torch.load('fine_tuned_best_model.pt')

if not os.path.isfile('genresids.pckl'):
  print('Genres ID Data file not found')
  exit(0)
 
fp = open('genresids.pckl','rb')
GenreIDtoName = pickle.load(fp)
fp.close()

genreList = sorted(list(GenreIDtoName.keys()))

if not os.path.isfile('input-model.pckl'):
  print('Input model Data file not found')
  exit(0)
 
fp = open('input-model.pckl','rb')
Xtrain, Ytrain, Xtest, Ytest = pickle.load(fp)
fp.close()

if not os.path.isfile('Y-predect.pckl'):
  print('Predected Y Data file not found')
  exit(0)
 
fp = open('Y-predect.pckl','rb')
Ypreds = pickle.load(fp)
fp.close()

precs = []
recs = []

for i in range(len(Ypreds)):
    row = Ypreds[i]
    gtGenres = Ytest[i]
    gtGenreNames = []
    
    for j in range(len(gtGenres)):
        if gtGenres[j]==1:
            gtGenreNames.append(GenreIDtoName[genreList[j]])
            
    top = np.argsort(row)[-3:]
    predictedGenres=[]
    
    for genre in top:
        predictedGenres.append(GenreIDtoName[genreList[genre]])
        
    precision,recall = precisionRecall(gtGenreNames, predictedGenres)
    
    precs.append(precision)
    recs.append(recall)
    
    if i%50 == 0:
        print('Predicted: ',', '.join(predictedGenres),' Actual: ',', '.join(gtGenreNames))
        
print('Deep Precision-Recall : ')
print('Precision AVG: ', np.mean(np.asarray(precs)), 'Recall AVG:', np.mean(np.asarray(recs)))


Predicted:  Action, Adventure, Fantasy  Actual:  Adventure, Animation, Action, Comedy, Science Fiction
Predicted:  Romance, Family, Animation  Actual:  Thriller, Mystery
Deep Precision-Recall : 
Precision AVG:  0.23943661971830982 Recall AVG: 0.20704225352112676
