# 1 - Exploration et nettoyage

In [65]:
import pandas as pd
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)


In [66]:
# Build the DataFrame from the raw CSV file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns holding both numbers and text are typed
# consistently (this is what pandas' mixed-type DtypeWarning recommends).
df = pd.read_csv('flickr_data2.csv', low_memory=False)

  df = pd.read_csv('flickr_data2.csv')


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420240 entries, 0 to 420239
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   420240 non-null  int64  
 1    user                420240 non-null  object 
 2    lat                 420240 non-null  float64
 3    long                420240 non-null  float64
 4    tags                316730 non-null  object 
 5    title               381911 non-null  object 
 6    date_taken_minute   420239 non-null  float64
 7    date_taken_hour     420240 non-null  int64  
 8    date_taken_day      420240 non-null  int64  
 9    date_taken_month    420240 non-null  int64  
 10   date_taken_year     420240 non-null  int64  
 11   date_upload_minute  420228 non-null  object 
 12   date_upload_hour    420238 non-null  object 
 13   date_upload_day     420238 non-null  float64
 14   date_upload_month   420240 non-null  int64  
 15   date_upload_year

On enlève les espaces en début de nom de colonne.

In [68]:
# Several headers start with a space (e.g. " user"); trim whitespace around every name.
df.columns = [name.strip() for name in df.columns]

## Colonnes de dates

Les colonnes de dates sont converties en numérique.

In [69]:
# Date-component columns: when the photo was taken, then when it was uploaded.
taken_cols = [
    "date_taken_minute",
    "date_taken_hour",
    "date_taken_day",
    "date_taken_month",
    "date_taken_year",
]
upload_cols = [
    "date_upload_minute",
    "date_upload_hour",
    "date_upload_day",
    "date_upload_month",
    "date_upload_year",
]
date_cols = taken_cols + upload_cols

# Force every component to a numeric dtype; values that cannot be parsed become NaN.
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in date_cols}
)

On s'intéresse à la cohérence de ces colonnes.

In [70]:
# Inclusive valid range for each kind of date component.
MINUTE_RANGE, HOUR_RANGE, DAY_RANGE = (0, 59), (0, 23), (1, 31)
MONTH_RANGE, YEAR_RANGE = (1, 12), (1900, 2026)

date_rules = dict(
    date_taken_minute=MINUTE_RANGE,
    date_taken_hour=HOUR_RANGE,
    date_taken_day=DAY_RANGE,
    date_taken_month=MONTH_RANGE,
    date_taken_year=YEAR_RANGE,
    date_upload_minute=MINUTE_RANGE,
    date_upload_hour=HOUR_RANGE,
    date_upload_day=DAY_RANGE,
    date_upload_month=MONTH_RANGE,
    date_upload_year=YEAR_RANGE,
)

# One boolean column per rule: True where the value lies inside its range.
# A missing value (NaN) is never "between" the bounds, so it counts as out of range.
in_range = pd.DataFrame(
    {name: df[name].between(low, high) for name, (low, high) in date_rules.items()}
)

# A row is an outlier as soon as at least one of its components is out of range.
outlier_mask = ~in_range.all(axis=1)

df_outliers = df[outlier_mask]
df_outliers.head(5)

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,Unnamed: 16,Unnamed: 17,Unnamed: 18
42366,5464485473,35635047@N03,45.765517,4.76651,"lundimatin,lyondefi38nuit",une lundi matin comme tout les autre ;-(25,6.0,21,2,2011,11,15.0,21.0,2.0,2011,,,,
85950,6674970791,29713277@N02,45.753948,4.788145,"portrait,throughtheleaves,autraversdesfeuillages",,2012.0,29,12,10,9,,47.0,21.0,10,1.0,2011.0,,
90872,6674970791,29713277@N02,45.753948,4.788145,"portrait,throughtheleaves,autraversdesfeuillages",,2012.0,29,12,10,9,,47.0,21.0,10,1.0,2011.0,,
98808,7386785280,37290448@N04,45.771315,4.835829,"city,plants,plant,france,brick,green,stone,wall,french,lyon,stones,bricks,creative,commons,cc,creativecommons,walls,lyons",Plant,2012.0,17,9,6,6,,32.0,17.0,17,6.0,2012.0,,
98816,7387000024,38586649@N00,45.779196,4.853596,"uploaded:by=flicksquare,foursquare:venue=4b851cb1f964a5207a4c31e3,geo:lat=45779196981146825,geo:lon=48535966873168945",Beautiful weather in #Lyon,2012.0,46,11,17,6,,46.0,17.0,17,6.0,2012.0,,


Plusieurs colonnes contiennent des valeurs aberrantes (trop petites ou trop grandes) qui ne correspondent pas à des dates valides.
On décide de créer de nouvelles colonnes de type datetime : les lignes avec des valeurs aberrantes seront converties en NaT (Not a Time).

Si on a besoin de faire des analyses sur la date, on les fera en utilisant ces nouvelles colonnes nettoyées.
On ne supprime pas les lignes avec des valeurs aberrantes, car elles peuvent contenir des informations utiles dans d'autres colonnes.

In [71]:
def _assemble_datetime(frame, prefix):
    """Combine the ``<prefix>_year`` ... ``<prefix>_minute`` columns into one datetime Series.

    Rows whose components cannot be assembled into a valid timestamp yield NaT,
    because the conversion runs with ``errors="coerce"``.
    """
    components = {
        unit: frame[f"{prefix}_{unit}"]
        for unit in ("year", "month", "day", "hour", "minute")
    }
    return pd.to_datetime(components, errors="coerce")


# Proper datetime columns for the capture time and the upload time.
df["taken_dt"] = _assemble_datetime(df, "date_taken")
df["upload_dt"] = _assemble_datetime(df, "date_upload")

df[["taken_dt", "upload_dt"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420240 entries, 0 to 420239
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   taken_dt   420097 non-null  datetime64[ns]
 1   upload_dt  420097 non-null  datetime64[ns]
dtypes: datetime64[ns](2)
memory usage: 6.4 MB


## Colonnes Unnamed

On décide de supprimer les colonnes inutiles "Unnamed: 16", "Unnamed: 17" et "Unnamed: 18" car elles ne contiennent aucune information pertinente pour notre analyse, et sont globalement vides.

In [72]:
# Remove the three "Unnamed" columns from the frame, then re-inspect its structure.
unnamed_cols = ["Unnamed: 16", "Unnamed: 17", "Unnamed: 18"]
df = df.drop(columns=unnamed_cols)
df.info()

# Open question, for an extra challenge: why are some rows shifted
# (e.g. the value 2012 sometimes ends up in the "seconds" field)?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420240 entries, 0 to 420239
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  420240 non-null  int64         
 1   user                420240 non-null  object        
 2   lat                 420240 non-null  float64       
 3   long                420240 non-null  float64       
 4   tags                316730 non-null  object        
 5   title               381911 non-null  object        
 6   date_taken_minute   420239 non-null  float64       
 7   date_taken_hour     420240 non-null  int64         
 8   date_taken_day      420240 non-null  int64         
 9   date_taken_month    420240 non-null  int64         
 10  date_taken_year     420240 non-null  int64         
 11  date_upload_minute  420099 non-null  float64       
 12  date_upload_hour    420236 non-null  float64       
 13  date_upload_day     420238 no

## Lignes dupliquées

On vient vérifier le nombre de lignes dupliquées dans le DataFrame.

In [73]:
# Flag every row that exactly repeats an earlier row, then count the flags.
is_repeat = df.duplicated()
n_row_dupes = is_repeat.sum()
n_row_dupes

252139

Il y a 252 139 lignes dupliquées dans le DataFrame. On les supprime.

In [74]:
# Remove exact duplicate rows, retaining the first occurrence of each.
df = df.drop_duplicates(keep="first")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 168101 entries, 0 to 420066
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  168101 non-null  int64         
 1   user                168101 non-null  object        
 2   lat                 168101 non-null  float64       
 3   long                168101 non-null  float64       
 4   tags                126104 non-null  object        
 5   title               152321 non-null  object        
 6   date_taken_minute   168100 non-null  float64       
 7   date_taken_hour     168101 non-null  int64         
 8   date_taken_day      168101 non-null  int64         
 9   date_taken_month    168101 non-null  int64         
 10  date_taken_year     168101 non-null  int64         
 11  date_upload_minute  168053 non-null  float64       
 12  date_upload_hour    168099 non-null  float64       
 13  date_upload_day     168100 non-nul

## Colonne URL

On ajoute la colonne "url" qui sera utile pour accéder aux images.

In [75]:
# Photo page URL, built as <base>/<user>/<photo id>.
FLICKR_PHOTOS_BASE = "https://www.flickr.com/photos/"
user_part = df["user"].astype(str)
photo_part = df["id"].astype(str)
df["url"] = FLICKR_PHOTOS_BASE + user_part + "/" + photo_part
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 168101 entries, 0 to 420066
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  168101 non-null  int64         
 1   user                168101 non-null  object        
 2   lat                 168101 non-null  float64       
 3   long                168101 non-null  float64       
 4   tags                126104 non-null  object        
 5   title               152321 non-null  object        
 6   date_taken_minute   168100 non-null  float64       
 7   date_taken_hour     168101 non-null  int64         
 8   date_taken_day      168101 non-null  int64         
 9   date_taken_month    168101 non-null  int64         
 10  date_taken_year     168101 non-null  int64         
 11  date_upload_minute  168053 non-null  float64       
 12  date_upload_hour    168099 non-null  float64       
 13  date_upload_day     168100 non-nul

## Colonnes latitude, longitude

On vérifie la cohérence des colonnes "lat" et "long", c'est-à-dire que les coordonnées correspondent bien à un point existant sur Terre (latitude entre -90 et 90, longitude entre -180 et 180).

In [76]:
# Inclusive bounds accepted for each coordinate column.
geo_rules = dict(lat=(-90, 90), long=(-180, 180))

# One boolean column per coordinate: True where the value is inside its bounds.
within_bounds = pd.DataFrame(
    {name: df[name].between(low, high) for name, (low, high) in geo_rules.items()}
)

# A row is flagged when at least one coordinate falls outside its bounds.
outlier_mask = ~within_bounds.all(axis=1)

df_geo_outliers = df[outlier_mask]
df_geo_outliers.shape

(0, 19)

Pas de problème apparent.

In [77]:
# Save the current state of the DataFrame, without writing the index column.
cleaned_csv_path = "flickr_data_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

In [78]:
# Preview the DataFrame (the rendered output abbreviates the middle rows).
df

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,date_taken_year,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,taken_dt,upload_dt,url
0,4395181099,30624617@N03,45.754858,4.821710,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,2010,23.0,20.0,28.0,2,2010.0,2010-02-28 15:11:00,2010-02-28 20:23:00,https://www.flickr.com/photos/30624617@N03/4395181099
1,4394748717,35853470@N00,45.753270,4.862953,,,51.0,17,28,2,2010,52.0,17.0,28.0,2,2010.0,2010-02-28 17:51:00,2010-02-28 17:52:00,https://www.flickr.com/photos/35853470@N00/4394748717
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,2010,33.0,17.0,28.0,2,2010.0,2010-02-28 17:29:00,2010-02-28 17:33:00,https://www.flickr.com/photos/11817998@N05/4394694699
3,4394803790,11545749@N06,45.784000,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,2010,38.0,12.0,28.0,2,2010.0,2010-01-28 20:15:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803790
4,4394803554,11545749@N06,45.784000,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,2010,38.0,12.0,28.0,2,2010.0,2010-01-28 20:10:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419086,45746643811,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,32.0,15,17,10,2018,40.0,11.0,6.0,11,2018.0,2018-10-17 15:32:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746643811
419087,45746642111,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,33.0,15,17,10,2018,40.0,11.0,6.0,11,2018.0,2018-10-17 15:33:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746642111
419098,45680287992,146709606@N06,45.729498,4.951550,"car,autocar,tourisme,interurbain,van,hool,vanhool,tx,tdx,27,tdx27,astromega,tdx27astromega,autocarexpo,lyon,2018",VAN HOOL TDX27 Astromega - Van Hool,52.0,13,17,10,2018,0.0,13.0,5.0,11,2018.0,2018-10-17 13:52:00,2018-11-05 13:00:00,https://www.flickr.com/photos/146709606@N06/45680287992
419137,44995017704,48633948@N08,45.771852,4.833115,"lyon,1714,4814we69,man,hess,nmt222,trolleybus,rue,pouteau,france,bus,buses",Lyon 1714 Rue Pouteau,20.0,14,18,10,2017,19.0,21.0,4.0,11,2018.0,2017-10-18 14:20:00,2018-11-04 21:19:00,https://www.flickr.com/photos/48633948@N08/44995017704


## Colonnes title, tags

In [79]:
# Normalise the free-text columns: pandas "string" dtype, missing values as "",
# and no leading/trailing whitespace.
for text_col in ("title", "tags", "user"):
    df[text_col] = df[text_col].astype("string").fillna("").str.strip()

# Collapse any run of whitespace inside a title into a single space.
df["title"] = df["title"].str.replace(r"\s+", " ", regex=True)

On transforme les tags en liste.

In [80]:
def split_tags(s: str):
    """Turn a comma-separated tag string into a sorted list of unique tags.

    Each tag is stripped of surrounding whitespace and lower-cased; empty
    entries are discarded. Anything that is not a string, or a string that is
    blank, produces an empty list.
    """
    if not isinstance(s, str):
        return []
    # A set removes duplicates; a blank input leaves only "" which is discarded below.
    unique_tags = {piece.strip().lower() for piece in s.split(",")}
    unique_tags.discard("")
    return sorted(unique_tags)

# Derive the parsed tag list and its length for every row.
# The keyword arguments are evaluated in order, so tag_count can read tag_list.
df = df.assign(
    tag_list=lambda frame: frame["tags"].map(split_tags),
    tag_count=lambda frame: frame["tag_list"].map(len),
)

In [81]:
# Preview the DataFrame with the new tag_list / tag_count columns.
df

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,...,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,taken_dt,upload_dt,url,tag_list,tag_count
0,4395181099,30624617@N03,45.754858,4.821710,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,...,23.0,20.0,28.0,2,2010.0,2010-02-28 15:11:00,2010-02-28 20:23:00,https://www.flickr.com/photos/30624617@N03/4395181099,"[chair, chaise, lyon, rhône, rhônealpes]",5
1,4394748717,35853470@N00,45.753270,4.862953,,,51.0,17,28,2,...,52.0,17.0,28.0,2,2010.0,2010-02-28 17:51:00,2010-02-28 17:52:00,https://www.flickr.com/photos/35853470@N00/4394748717,[],0
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,...,33.0,17.0,28.0,2,2010.0,2010-02-28 17:29:00,2010-02-28 17:33:00,https://www.flickr.com/photos/11817998@N05/4394694699,"[365, iphone]",2
3,4394803790,11545749@N06,45.784000,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,...,38.0,12.0,28.0,2,2010.0,2010-01-28 20:15:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803790,"[avott, gift, nin, nineinchnails, screening, toiou]",6
4,4394803554,11545749@N06,45.784000,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,...,38.0,12.0,28.0,2,2010.0,2010-01-28 20:10:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803554,"[avott, gift, lyon, nin, nineinchnails, screening, toiou]",7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419086,45746643811,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,32.0,15,17,10,...,40.0,11.0,6.0,11,2018.0,2018-10-17 15:32:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746643811,"[autocar, car, dcg, dietrichcarebusgroup, interurbain, ld, ld12, ld12sbplus, ligne, scolaire, temsa]",11
419087,45746642111,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,33.0,15,17,10,...,40.0,11.0,6.0,11,2018.0,2018-10-17 15:33:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746642111,"[autocar, car, dcg, dietrichcarebusgroup, interurbain, ld, ld12, ld12sbplus, ligne, scolaire, temsa]",11
419098,45680287992,146709606@N06,45.729498,4.951550,"car,autocar,tourisme,interurbain,van,hool,vanhool,tx,tdx,27,tdx27,astromega,tdx27astromega,autocarexpo,lyon,2018",VAN HOOL TDX27 Astromega - Van Hool,52.0,13,17,10,...,0.0,13.0,5.0,11,2018.0,2018-10-17 13:52:00,2018-11-05 13:00:00,https://www.flickr.com/photos/146709606@N06/45680287992,"[2018, 27, astromega, autocar, autocarexpo, car, hool, interurbain, lyon, tdx, tdx27, tdx27astromega, tourisme, tx, van, vanhool]",16
419137,44995017704,48633948@N08,45.771852,4.833115,"lyon,1714,4814we69,man,hess,nmt222,trolleybus,rue,pouteau,france,bus,buses",Lyon 1714 Rue Pouteau,20.0,14,18,10,...,19.0,21.0,4.0,11,2018.0,2017-10-18 14:20:00,2018-11-04 21:19:00,https://www.flickr.com/photos/48633948@N08/44995017704,"[1714, 4814we69, bus, buses, france, hess, lyon, man, nmt222, pouteau, rue, trolleybus]",12


In [82]:
# Overwrite the cleaned export with the current DataFrame (index not written).
# NOTE(review): tag_list holds Python lists, which a CSV can only store as text;
# they will need to be parsed back into lists when the file is reloaded.
df.to_csv(path_or_buf="flickr_data_cleaned.csv", index=False)

In [83]:
# Preview the DataFrame as it was just exported.
df

Unnamed: 0,id,user,lat,long,tags,title,date_taken_minute,date_taken_hour,date_taken_day,date_taken_month,...,date_upload_minute,date_upload_hour,date_upload_day,date_upload_month,date_upload_year,taken_dt,upload_dt,url,tag_list,tag_count
0,4395181099,30624617@N03,45.754858,4.821710,"chair,lyon,rhône,chaise,rhônealpes",Chaises avec vue,11.0,15,28,2,...,23.0,20.0,28.0,2,2010.0,2010-02-28 15:11:00,2010-02-28 20:23:00,https://www.flickr.com/photos/30624617@N03/4395181099,"[chair, chaise, lyon, rhône, rhônealpes]",5
1,4394748717,35853470@N00,45.753270,4.862953,,,51.0,17,28,2,...,52.0,17.0,28.0,2,2010.0,2010-02-28 17:51:00,2010-02-28 17:52:00,https://www.flickr.com/photos/35853470@N00/4394748717,[],0
2,4394694699,11817998@N05,45.760655,4.846564,"365,iphone",59/365 - R46 V103 B163,29.0,17,28,2,...,33.0,17.0,28.0,2,2010.0,2010-02-28 17:29:00,2010-02-28 17:33:00,https://www.flickr.com/photos/11817998@N05/4394694699,"[365, iphone]",2
3,4394803790,11545749@N06,45.784000,4.874072,"nin,nineinchnails,gift,screening,toiou,avott",2010-01-29 Toiou Avott Lyon,15.0,20,28,1,...,38.0,12.0,28.0,2,2010.0,2010-01-28 20:15:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803790,"[avott, gift, nin, nineinchnails, screening, toiou]",6
4,4394803554,11545749@N06,45.784000,4.874072,"lyon,nin,nineinchnails,gift,screening,toiou,avott",2010-01-28 Toiou Avott Lyon,10.0,20,28,1,...,38.0,12.0,28.0,2,2010.0,2010-01-28 20:10:00,2010-02-28 12:38:00,https://www.flickr.com/photos/11545749@N06/4394803554,"[avott, gift, lyon, nin, nineinchnails, screening, toiou]",7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
419086,45746643811,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,32.0,15,17,10,...,40.0,11.0,6.0,11,2018.0,2018-10-17 15:32:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746643811,"[autocar, car, dcg, dietrichcarebusgroup, interurbain, ld, ld12, ld12sbplus, ligne, scolaire, temsa]",11
419087,45746642111,146709606@N06,45.729498,4.951550,"car,autocar,interurbain,ligne,scolaire,temsa,ld,ld12,ld12sbplus,dcg,dietrichcarebusgroup",TEMSA LD 12 SB Plus - DGC,33.0,15,17,10,...,40.0,11.0,6.0,11,2018.0,2018-10-17 15:33:00,2018-11-06 11:40:00,https://www.flickr.com/photos/146709606@N06/45746642111,"[autocar, car, dcg, dietrichcarebusgroup, interurbain, ld, ld12, ld12sbplus, ligne, scolaire, temsa]",11
419098,45680287992,146709606@N06,45.729498,4.951550,"car,autocar,tourisme,interurbain,van,hool,vanhool,tx,tdx,27,tdx27,astromega,tdx27astromega,autocarexpo,lyon,2018",VAN HOOL TDX27 Astromega - Van Hool,52.0,13,17,10,...,0.0,13.0,5.0,11,2018.0,2018-10-17 13:52:00,2018-11-05 13:00:00,https://www.flickr.com/photos/146709606@N06/45680287992,"[2018, 27, astromega, autocar, autocarexpo, car, hool, interurbain, lyon, tdx, tdx27, tdx27astromega, tourisme, tx, van, vanhool]",16
419137,44995017704,48633948@N08,45.771852,4.833115,"lyon,1714,4814we69,man,hess,nmt222,trolleybus,rue,pouteau,france,bus,buses",Lyon 1714 Rue Pouteau,20.0,14,18,10,...,19.0,21.0,4.0,11,2018.0,2017-10-18 14:20:00,2018-11-04 21:19:00,https://www.flickr.com/photos/48633948@N08/44995017704,"[1714, 4814we69, bus, buses, france, hess, lyon, man, nmt222, pouteau, rue, trolleybus]",12


Les mots inutiles dans les colonnes tags et title


In [84]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Hand-picked French stop words, grouped by grammatical role.
french_stopwords = {
    # articles, determiners and contracted forms
    "le", "la", "les", "un", "une", "des", "du", "ce", "cette", "ces",
    # prepositions and conjunctions
    "de", "à", "au", "aux", "en", "pour", "sur", "avec", "sans", "dans", "et",
    # forms of "être" / "avoir"
    "est", "sont", "été", "être", "avoir",
}

# Combined English + French stop-word collection.
stopwords = ENGLISH_STOP_WORDS | french_stopwords


In [85]:
import re


def clean_text(text, stop_words=None):
    """Normalise a piece of text and strip out uninformative words.

    The text is lower-cased, every character that is not a lowercase Latin
    letter (including the accented letters listed in the pattern) or
    whitespace is replaced by a space, and the remaining words are filtered.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (None, NaN, pd.NA, ...) is
        treated as empty text instead of raising.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the notebook-level ``stopwords``.

    Returns
    -------
    str
        The kept words joined by single spaces ("" when nothing is left).
    """
    # Missing values are not strings and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = stopwords

    # Lower-case first so the character class only needs lowercase letters;
    # punctuation, digits and other symbols all become separators.
    lowered = re.sub(r"[^a-zàâçéèêëîïôûùüÿñæœ\s]", " ", text.lower())

    # Keep words longer than two characters that are not stop words.
    kept = [
        word
        for word in lowered.split()
        if len(word) > 2 and word not in stop_words
    ]
    return " ".join(kept)

# Store the cleaned version of each title in a new 'cleaned_title' column.
df['cleaned_title'] = df['title'].apply(clean_text)


## TO-DO

- Clean les mots inutiles  
- Nuage de mots des données 




### Visualisation des données en nuage de mots 

In [86]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Word cloud of the photo titles, plus a table of the most frequent title words.
from collections import Counter

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Concatenate every title into a single text blob.
text = " ".join(df["title"])
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

# Render the cloud a single time: drawing again after plt.show() would open a
# second, identical figure.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Table of the 50 most frequent whitespace-separated words across all titles.
words = text.split()
word_counts = Counter(words)
most_common_words = word_counts.most_common(50)
most_common_words_df = pd.DataFrame(most_common_words, columns=["Word", "Count"])
most_common_words_df

Grâce au nuage de mots, on observe qu'on peut enlever les mots vides (« stop words ») tels que "de", "la", "du".

In [None]:
# Suppression des mots "de", "la", "du" des title du dataframe dans le csv flickr_data_cleaned.csv
stop_words = set(STOPWORDS)
# Les stop words sont les mots qu'on a repéré comme n'ayant pas de valeur sémantique 
# (date, mots qui servent à rien, Lyon car tout est à Lyon, 
# Demeure Chaos car c'est un mec qui signe toutes ses photos)
stop_words.update(["de", "la", "du", "Lyon", "Demeure", "Chaos", "est", "à"])

