# torchkeras工具函数演示

除了以优雅的方式训练pytorch模型，torchkeras 还为算法工程师提供了一些非常实用的工具函数。

这些工具函数的使用非常简单，通常只要一行代码就可解决算法工程师常常遇到的一些技术需求，这里稍作演示。

例如：


* 1，根据关键词抓取百度图片 🔥🔥🔥

* 2，根据url下载github文件

* 3，根据url获取图片

* 4，matplotlib支持中文和负号显示 🔥

* 5，matplotlib图像转换成PIL图像

* 6，文本转PIL图像

* 7，发送邮件

* 8，探索性数据分析(EDA) 🔥🔥

* 9，合并数据集文件夹

* 10，以彩色形式print

* 11，格式化打印dataframe

* 12，打印带时间分割线的日志 🔥

* 13，图片分析和重复图片清洗工具 🔥🔥🔥



In [None]:
!pip install -U torchkeras 

In [None]:
# Add the parent directory to the module search path.
# NOTE(review): append() puts ".." at the END of sys.path, so an installed
# torchkeras would presumably be picked up before a local checkout in the
# parent directory — confirm which one this notebook is meant to use.
import sys 
sys.path.append("..")

## 1，根据关键词抓取百度图片

In [None]:
# Fetch Baidu image-search results for the keyword '猫咪' ("kitten"):
# 100 pictures are requested and 'cats' is given as the save directory.
from  torchkeras.data import download_baidu_pictures 
download_baidu_pictures(keyword='猫咪', needed_pics_num=100, save_dir='cats')

## 2，根据url下载github文件

In [None]:
# Download a single file from GitHub given its URL.
# NOTE(review): the URL is a "blob" page link rather than a raw-content link;
# presumably the helper accepts this form — confirm.
from torchkeras.data import download_github_file 
download_github_file('https://github.com/lyhue1991/YOLOv8_tools/blob/main/wandb_callback.py')


## 3，根据url获取图片

In [None]:
# Fetch an image from a URL. The bare `img` expression on the last line makes
# the notebook display the returned object as the cell output.
from torchkeras.data import download_image 
img = download_image('https://pic1.zhimg.com/v2-10423b9e7bfccf690d7a0d16189029dd_1440w.jpg?source=d16d100b')
img 

## 4，matplotlib支持中文和负号显示

In [None]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline 
import numpy as np 
from torchkeras import plots 
# Configure the matplotlib font (size 12). Per this section's heading, this is
# the call that makes Chinese text and the minus sign display correctly.
# NOTE(review): it is called before pyplot is imported — whether that order
# matters is not visible from here.
plots.set_matplotlib_font(font_size=12) 
import matplotlib.pyplot as plt 

# Sine curve over [-2*pi, 2*pi]; the negative x range puts minus signs on the
# axis ticks.
x = np.linspace(-2*np.pi,2*np.pi,1000)
y = np.sin(x)
plt.plot(x,y)
# Chinese title ("sine curve") to exercise CJK text rendering.
plt.title('正弦曲线')


## 5，matplotlib图像转换成PIL图像

In [None]:
# Plot sin(x) over [0, 2*pi] and keep a handle to the current figure in `fig`;
# the next cell converts that figure to an image.
import matplotlib.pyplot as plt 
import numpy as np 
x = np.linspace(0,2*np.pi,1000)
y = np.sin(x)
plt.plot(x,y)
fig = plt.gcf()

In [None]:
# Convert the matplotlib figure `fig` (created in the previous cell) into an
# image (a PIL image, per this section's heading) and display it.
from torchkeras.plots import fig2img 
img = fig2img(fig)
img 

## 6，文本转PIL图像

In [None]:
# Render a multi-line string (ASCII plus Chinese, lines separated by '\n') as
# an image; the call's return value is shown as the cell output.
from torchkeras.utils import text_to_image
text_to_image('hello world\n你好中国！\n你好北京!')


## 7，发送邮件

In [None]:
# Send a test email with subject 'hello' and body 'hello world'.
# NOTE(review): the recipient address below is hard-coded — replace it with
# your own before running. Any sender/credential setup send_msg needs is not
# shown in this cell.
from torchkeras.email import send_msg 
send_msg(receivers =['745554619@qq.com'],
         subject='hello', msg='hello world')


## 8，探索性数据分析(EDA)

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd 
from torchkeras.eda import pipeline 


# Build a DataFrame from scikit-learn's breast-cancer dataset: one column per
# feature plus a "label" column holding the target.
breast = datasets.load_breast_cancer()
df = pd.DataFrame(breast.data,columns = breast.feature_names)
df["label"] = breast.target
# 70/30 train/test split. No random_state is passed, so the split is not
# reproducible between runs.
dftrain,dftest = train_test_split(df,test_size = 0.3)
# Run the torchkeras EDA pipeline on both splits and display its result.
dfeda = pipeline(dftrain,dftest)
dfeda 

## 9，合并数据集文件夹

图像任务相关的数据集通常会整理成文件夹形式，例如yolo格式。有时候我们会以增量的形式不断地新做一些数据。

有没有什么办法可以快速、方便地把新的数据集文件夹和老的数据集文件夹合并呢？

In [None]:
from pathlib import Path

# Create two toy dataset trees, ds1 and ds2, to be merged in the next cell.
# Each tree has images/{train,val} and labels/{train,val}; every leaf directory
# gets three empty placeholder files named 0-2 (.jpeg under images, .txt under
# labels).
for dataset_root in ['ds1', 'ds2']:
    for kind, suffix in (('images', '.jpeg'), ('labels', '.txt')):
        for split in ['train', 'val']:
            leaf_dir = Path(dataset_root) / kind / split
            leaf_dir.mkdir(parents=True, exist_ok=True)
            for idx in range(3):
                (leaf_dir / f'{idx}{suffix}').touch()

In [None]:
# Merge the two demo dataset folders into a new folder 'ds_merge'.
# NOTE(review): ds1 and ds2 (as created in the previous cell) contain
# identically named files (0-2); how name collisions are resolved is decided
# inside merge_dataset_folders — check its behavior before using real data.
from torchkeras.data import merge_dataset_folders 
from_folders = ['ds1','ds2']
to_folder = 'ds_merge'
merge_dataset_folders(from_folders,to_folder)


## 10，以彩色形式print 

In [None]:
# Print the same string via colorful(): once with its default color and then
# with color='blue'.
from torchkeras.utils import colorful 
print(colorful('helloworld'))
print(colorful('helloworld',color='blue'))
# NOTE(review): this line is identical to the previous one — possibly meant to
# demonstrate a different color; confirm the supported color names and adjust.
print(colorful('helloworld',color='blue'))

## 11，格式化打印dataframe

In [None]:
# Load the diabetes dataset as a DataFrame and attach its target column.
from sklearn.datasets import load_diabetes
ds = load_diabetes(as_frame=True)
df = ds['data'].copy()
df['target'] = ds['target']
# Add a text column containing a tab, a newline and Chinese characters, i.e.
# cell content that is awkward to lay out in a printed table.
df['text'] = 'hello\t 你好中国\n 你好 北京'

from torchkeras.utils import prettydf 
# nrows/ncols presumably cap how many rows/columns are printed — confirm.
# The trailing ';' keeps the notebook from echoing the call's return value.
prettydf(df,nrows=10,ncols=10);



## 12，打印带时间分割线的日志

In [None]:
# Print a log message; per this section's heading, the output includes a
# separator line carrying the current time.
from torchkeras.utils import printlog 
printlog('step1: reading data...')


## 13，图片分析和重复图片清洗工具

In [None]:
!pip install fastdup 

In [None]:
# Request 500 Baidu images for the keyword '猫咪' ("kitten") with 'cats' as the
# save directory; that folder is the input of the ImageCleaner cells below.
from  torchkeras.data import download_baidu_pictures 
download_baidu_pictures(keyword='猫咪', needed_pics_num=500, save_dir='cats')


In [None]:
# Point an ImageCleaner at the 'cats' folder and run its summary analysis.
from torchkeras.data import ImageCleaner
cleaner = ImageCleaner(img_files = 'cats')
# NOTE(review): the keyword is spelled `duplicate_similirity`; presumably this
# matches the library's own parameter name — confirm before "correcting" it.
# 0.99 and 0.02 look like the duplicate-similarity threshold and the outlier
# percentile — confirm against the ImageCleaner implementation.
cleaner.run_summary(duplicate_similirity=0.99, outlier_percentile=0.02)

In [None]:
# Retrieve the duplicates result from the cleaner and display it.
dfduplicates = cleaner.get_duplicates() 
dfduplicates 

In [None]:
# Retrieve the stats result from the cleaner and display it.
dfstats = cleaner.get_stats()
dfstats

In [None]:
# Visualize the duplicates found by the cleaner.
cleaner.vis_duplicates() 

In [None]:
# Delete the duplicates found by the cleaner.
# NOTE(review): presumably this removes image files from disk under 'cats' —
# confirm, and run it only after reviewing the duplicates shown above.
cleaner.delete_duplicates() 